diff --git a/.devops/cloud-v-pipeline b/.devops/cloud-v-pipeline
new file mode 100644
index 0000000000000000000000000000000000000000..f3a4944f8a419fdaaca79296006511e0bbfd0401
--- /dev/null
+++ b/.devops/cloud-v-pipeline
@@ -0,0 +1,22 @@
+node('x86_runner1'){                     // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
+    stage('Cleanup'){
+        cleanWs()                        // Cleaning previous CI build in workspace
+    }
+    stage('checkout repo'){
+        retry(5){                        // Retry in case cloning fails for any reason
+            checkout scm                 // Clone the repo on the runner
+        }
+    }
+    stage('Compiling llama.cpp'){
+        sh'''#!/bin/bash
+        make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama.cpp for RISC-V
+        '''
+    }
+    stage('Running llama.cpp'){
+        sh'''#!/bin/bash
+        module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
+        qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+        cat llama_log.txt # Printing results
+        '''
+    }
+}
diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile
index e5fcb37d6fe7af16d06c6434f4b43f5792b83f3b..360602d6567b8a835f23e7365cf222ca7299c1d7 100644
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip
+    apt-get install -y build-essential python3 python3-pip git
 
 COPY requirements.txt requirements.txt
diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6c521e9b4101fe067d7018ca469f3dc2b994c768
--- /dev/null
+++ b/.devops/full-rocm.Dockerfile
@@ -0,0 +1,44 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set the GPU architecture targets for the hipBLAS build
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
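For orientation, an image built from this Dockerfile would be exercised roughly as follows. This is a hedged sketch, not a documented workflow: the image tag, model path, and the --run flag of the tools.sh entrypoint are illustrative assumptions, and ROCm containers need the host's KFD and DRI devices passed through:

    # Build the fat ROCm image (tag is illustrative)
    docker build -t llama-cpp-full-rocm -f .devops/full-rocm.Dockerfile .

    # Run inference; /dev/kfd and /dev/dri expose the AMD GPU to the container
    docker run --device /dev/kfd --device /dev/dri -v /models:/models \
        llama-cpp-full-rocm --run -m /models/ggml-model-f16.gguf -p "Hello" -n 64
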
diff --git a/.devops/llama-cpp-clblast.srpm.spec b/.devops/llama-cpp-clblast.srpm.spec
new file mode 100644
index 0000000000000000000000000000000000000000..076f29695dc0a9ada284baa6eeea802377560489
--- /dev/null
+++ b/.devops/llama-cpp-clblast.srpm.spec
@@ -0,0 +1,84 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic OpenCL libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp-clblast
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        OpenCL Inference of LLaMA model in C/C++
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
+Requires:       clblast
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+OpenCL (CLBlast) inference of Meta's Llama 2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CLBLAST=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamaclblast
+cp -p server %{buildroot}%{_bindir}/llamaclblastserver
+cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
+[Unit]
+Description=Llama.cpp server (OpenCL/CLBlast build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=no
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamaclblast
+%{_bindir}/llamaclblastserver
+%{_bindir}/llamaclblastsimple
+/usr/lib/systemd/system/llamaclblast.service
+%config /etc/sysconfig/llama
+
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
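For a sense of how this spec is consumed, the session below is a generic rpmbuild invocation, not a workflow documented in this repo; note that Version: is stamped from the build date, so every rebuild sorts newer:

    # Fetch the tarball named in Source0, then build binary and source RPMs
    # (~/rpmbuild layout and the resulting date-based version are illustrative)
    wget -P ~/rpmbuild/SOURCES https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
    rpmbuild -ba .devops/llama-cpp-clblast.srpm.spec   # yields e.g. llama.cpp-clblast-20230829-1
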
diff --git a/.devops/llama-cpp-cublas.srpm.spec b/.devops/llama-cpp-cublas.srpm.spec
new file mode 100644
index 0000000000000000000000000000000000000000..f847ebb1e86134b943825be2537b04c21aa50f6f
--- /dev/null
+++ b/.devops/llama-cpp-cublas.srpm.spec
@@ -0,0 +1,83 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic OpenCL libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp-cublas
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        CUDA (cuBLAS) Inference of LLaMA model in C/C++
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
+Requires:       cuda-toolkit
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CUDA (cuBLAS) inference of Meta's Llama 2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CUBLAS=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamacppcublas
+cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
+[Unit]
+Description=Llama.cpp server (CUDA/cuBLAS build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=no
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamacppcublas
+%{_bindir}/llamacppcublasserver
+%{_bindir}/llamacppcublassimple
+/usr/lib/systemd/system/llamacublas.service
+%config /etc/sysconfig/llama
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec
new file mode 100644
index 0000000000000000000000000000000000000000..446213d6995e23089250b50c00ef6fc5a38b882b
--- /dev/null
+++ b/.devops/llama-cpp.srpm.spec
@@ -0,0 +1,85 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+#    In the meantime, YYYYMMDD format will be used.
+# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic OpenCL libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
+Requires:       libstdc++
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference for Meta's Llama 2 models using default options.
+Models are not included in this package and must be downloaded separately.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llama
+cp -p server %{buildroot}%{_bindir}/llamaserver
+cp -p simple %{buildroot}%{_bindir}/llamasimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
+[Unit]
+Description=Llama.cpp server, CPU only (no GPU support in this build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=no
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llama
+%{_bindir}/llamaserver
+%{_bindir}/llamasimple
+/usr/lib/systemd/system/llama.service
+%config /etc/sysconfig/llama
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
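All three specs install the same /etc/sysconfig/llama environment file, which their systemd units read via EnvironmentFile, so the model and server flags are changed by editing that file and restarting the service. A hypothetical session for the CPU package (model path illustrative):

    # Point the server at a different model, then restart it
    echo 'LLAMA_ARGS="-m /opt/llama2/ggml-model-q4_0.bin -c 2048"' | sudo tee /etc/sysconfig/llama
    sudo systemctl restart llama.service   # or: systemctl enable --now llama.service on first install
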
diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
index 30c01196ab520773d949f02a82ce6b14c2f89674..2b7faf7c11c0bbf3ea039413a29219c6fa92a5bb 100644
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 
 RUN apt-get update && \
-    apt-get install -y build-essential
+    apt-get install -y build-essential git
 
 WORKDIR /app
 
diff --git a/.devops/main-rocm.Dockerfile b/.devops/main-rocm.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..789deff6dc8c1d44a308d6631af6e7d845641372
--- /dev/null
+++ b/.devops/main-rocm.Dockerfile
@@ -0,0 +1,44 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set the GPU architecture targets for the hipBLAS build
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT [ "/app/main" ]
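Because ROCM_DOCKER_ARCH is a plain build argument, the fat architecture list above can be narrowed at build time to just the target GPU, which shortens the build considerably; the tag and the gfx1030 (RDNA2) target below are only examples:

    # Build main-rocm for a single architecture
    docker build -t llama-cpp-main-rocm \
        --build-arg ROCM_DOCKER_ARCH=gfx1030 \
        -f .devops/main-rocm.Dockerfile .
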
diff --git a/.editorconfig b/.editorconfig
index 135a7e4bce5a168e13818ac47da04a693ce8ff8d..f8245b85c6c57e711ff5669fcf97ec9ed492af33 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -17,3 +17,6 @@ indent_style = tab
 
 [prompts/*.txt]
 insert_final_newline = unset
+
+[examples/server/public/*]
+indent_size = 2
diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml
new file mode 100644
index 0000000000000000000000000000000000000000..392db8a089ac5a1c667c677bd5c8f836d8f46455
--- /dev/null
+++ b/.github/workflows/code-coverage.yml
@@ -0,0 +1,36 @@
+name: Code Coverage
+on: [push, pull_request]
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+
+jobs:
+  run:
+    runs-on: ubuntu-20.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential gcc-8 lcov
+
+      - name: Build
+        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
+
+      - name: Run tests
+        run: CC=gcc-8 make test
+
+      - name: Generate coverage report
+        run: |
+          make coverage
+          make lcov-report
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+        with:
+          files: lcov-report/coverage.info
diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e61bfc6c32b608af29452380b3236f99bec9aa55
--- /dev/null
+++ b/.github/workflows/gguf-publish.yml
@@ -0,0 +1,43 @@
+# This workflow will upload a Python Package using Twine when a GGUF release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+# See `gguf-py/README.md` for how to make a release.
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
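The workflow below publishes gguf-py when a matching tag is pushed (see gguf-py/README.md for the full release procedure), so cutting a release looks like this, with the version number purely illustrative:

    # Tag the release; pushing the tag triggers the publish job
    git tag gguf-v0.4.0
    git push origin gguf-v0.4.0
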
+
+name: Upload Python Package
+
+on:
+  workflow_dispatch:
+  push:
+    # Pattern matched against refs/tags
+    tags:
+      - 'gguf-v*' # Push events to every version tag
+
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9.x'
+      - name: Install dependencies
+        run: |
+          cd gguf-py
+          python -m pip install poetry
+          poetry install
+
+      - name: Build package
+        run: poetry build
+      - name: Publish package
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 043cbe7fbfebbadf7adb95ee0dfb136cf50fbac9..1a87853fd635faaaab57453deb82fd1dc2410712 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 *.o
 *.a
-*.so
+*.bin
 .DS_Store
 .build/
 .cache/
@@ -12,20 +12,7 @@
 .vs/
 .vscode/
 
-build/
-build-em/
-build-debug/
-build-release/
-build-ci-debug/
-build-ci-release/
-build-static/
-build-cublas/
-build-opencl/
-build-metal/
-build-mpi/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build*/
 out/
 tmp/
 
@@ -39,19 +26,24 @@ models-mnt
 /perplexity
 /embedding
 /train-text-from-scratch
+/convert-llama2c-to-ggml
 /simple
 /benchmark-matmult
 /vdot
 /server
 /Pipfile
 /embd-input-test
+/gguf
+/gguf-llama-simple
 /libllama.so
-
+/llama-bench
+build-info.h
 arm_neon.h
 compile_commands.json
 CMakeSettings.json
 
 __pycache__
+dist
 dist/
 
 *.spec
@@ -65,11 +57,11 @@ perf-*.txt
 
 examples/jeopardy/results.txt
 
-pyproject.toml
 poetry.lock
 poetry.toml
 
 # Test binaries
+tests/test-grammar-parser
 tests/test-double-float
 tests/test-grad0
 tests/test-opt
@@ -78,16 +70,21 @@ tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0
 
-koboldcpp.so
-koboldcpp_failsafe.so
-koboldcpp_openblas.so
-koboldcpp_noavx2.so
-koboldcpp_clblast.so
-koboldcpp.dll
-koboldcpp_failsafe.dll
-koboldcpp_openblas.dll
-koboldcpp_noavx2.dll
-koboldcpp_clblast.dll
-koboldcpp_cublas.dll
-cublas64_11.dll
-cublasLt64_11.dll
+/koboldcpp_default.so
+/koboldcpp_failsafe.so
+/koboldcpp_openblas.so
+/koboldcpp_noavx2.so
+/koboldcpp_clblast.so
+/koboldcpp_cublas.so
+/koboldcpp_default.dll
+/koboldcpp_failsafe.dll
+/koboldcpp_openblas.dll
+/koboldcpp_noavx2.dll
+/koboldcpp_clblast.dll
+/koboldcpp_cublas.dll
+/cublas64_11.dll
+/cublasLt64_11.dll
+/rocblas/
+rocblas.dll
+hipblas.dll
+koboldcpp_hipblas.so
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 103abe3101744bf32b53c5701adbefca28adfb9f..6efba527da8cd0397a0f9c7679400dac84bf0514 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,6 +50,9 @@ set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA
 set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16       "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
+set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
+    "llama: max. batch size for using peer access")
+option(LLAMA_HIPBLAS        "llama: use hipBLAS" OFF)
 
 option(LLAMA_K_QUANTS       "llama: use k-quants" ON)
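The new LLAMA_HIPBLAS option is the CMake-side switch the ROCm Dockerfiles rely on. A plausible configure invocation, assuming ROCm is installed under /opt/rocm as the compiler hints in the block further below expect:

    # Configure and build with hipBLAS, using ROCm's LLVM toolchain
    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
        cmake -B build -DLLAMA_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release
    cmake --build build -j
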
@@ -65,6 +68,11 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 
 add_compile_definitions(GGML_USE_K_QUANTS)
+add_compile_definitions(LOG_DISABLE_LOGS)
+
+set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
+set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
+set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
 
 if (LLAMA_CUBLAS)
     cmake_minimum_required(VERSION 3.17)
@@ -75,10 +83,6 @@ if (LLAMA_CUBLAS)
 
         enable_language(CUDA)
 
-        set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
-        set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
-        set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
-
         add_compile_definitions(GGML_USE_CUBLAS)
         #add_compile_definitions(GGML_CUDA_CUBLAS) #remove to not use cublas
         add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
@@ -91,6 +95,7 @@ if (LLAMA_CUBLAS)
             add_compile_definitions(GGML_CUDA_F16)
         endif()
         add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
 
         if (LLAMA_STATIC)
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -121,6 +126,75 @@ if (LLAMA_CUBLAS)
     endif()
 endif()
 
+if (LLAMA_HIPBLAS)
+    if (MSVC)
+        list(APPEND CMAKE_PREFIX_PATH "C:/Program Files/AMD/ROCm/5.5")
+    else()
+        list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
+    endif()
+
+
+    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
+    endif()
+    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+    endif()
+
+    find_package(hip)
+    find_package(hipblas)
+    find_package(rocblas)
+
+    if (${hipblas_FOUND} AND ${hip_FOUND})
+        message(STATUS "HIP and hipBLAS found")
+        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+        add_library(ggml-rocm OBJECT ${GGML_SOURCES_CUDA})
+        if (LLAMA_CUDA_FORCE_DMMV)
+            target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
+        endif()
+        target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+        target_compile_definitions(ggml-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+        target_compile_definitions(ggml-rocm PUBLIC CC_TURING=1000000000)
+        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
+        target_link_libraries(ggml-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
+
+
+        add_library(ggml-v2-rocm OBJECT ${GGML_V2_CUDA_SOURCES})
+        if (LLAMA_CUDA_FORCE_DMMV)
+            target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
+        endif()
+        target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+        target_compile_definitions(ggml-v2-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+        target_compile_definitions(ggml-v2-rocm PUBLIC CC_TURING=1000000000)
+        set_source_files_properties(otherarch/ggml_v2-cuda.cu PROPERTIES LANGUAGE CXX)
+        target_link_libraries(ggml-v2-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
+
+
+        add_library(ggml-v2-legacy-rocm OBJECT
${GGML_V2_LEGACY_CUDA_SOURCES}) + if (LLAMA_CUDA_FORCE_DMMV) + target_compile_definitions(ggml-v2-legacy-rocm PUBLIC GGML_CUDA_FORCE_DMMV) + endif() + target_compile_definitions(ggml-v2-legacy-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) + target_compile_definitions(ggml-v2-legacy-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) + target_compile_definitions(ggml-v2-legacy-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) + target_compile_definitions(ggml-v2-legacy-rocm PUBLIC CC_TURING=1000000000) + set_source_files_properties(otherarch/ggml_v2-cuda-legacy.cu PROPERTIES LANGUAGE CXX) + target_link_libraries(ggml-v2-legacy-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas) + + + + + if (LLAMA_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") + endif() + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm ggml-v2-rocm ggml-v2-legacy-rocm) + else() + message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm") + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags @@ -133,15 +207,22 @@ if (LLAMA_ALL_WARNINGS) -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes + -Werror=implicit-int + -Wno-unused-function ) set(cxx_flags -Wall -Wextra -Wpedantic -Wcast-qual + -Wmissing-declarations -Wno-unused-function -Wno-multichar ) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # g++ only + set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds) + endif() else() # todo : msvc endif() @@ -153,7 +234,7 @@ if (LLAMA_ALL_WARNINGS) endif() -if (MSVC) +if (WIN32) add_compile_definitions(_CRT_SECURE_NO_WARNINGS) if (BUILD_SHARED_LIBS) @@ -190,7 +271,7 @@ if (NOT MSVC) endif() endif() -if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") +if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")) message(STATUS "ARM detected") if (MSVC) # TODO: arm msvc? @@ -301,37 +382,52 @@ target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(common2 - examples/common.cpp - examples/common.h) -target_include_directories(common2 PUBLIC . ./otherarch ./otherarch/tools ./examples) + common/common.cpp + common/common.h + common/grammar-parser.h + common/grammar-parser.cpp) +target_include_directories(common2 PUBLIC . ./otherarch ./otherarch/tools ./examples ./common) target_compile_features(common2 PUBLIC cxx_std_11) # don't bump target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(gpttype_adapter gpttype_adapter.cpp) -target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./examples) +target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./examples ./common) target_compile_features(gpttype_adapter PUBLIC cxx_std_11) # don't bump target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) +if (LLAMA_CUBLAS) + set(TARGET koboldcpp_cublas) + add_library(${TARGET} SHARED expose.cpp expose.h) + target_include_directories(${TARGET} PUBLIC . 
./otherarch ./otherarch/tools ./examples ./common) + target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump + set_target_properties(${TARGET} PROPERTIES PREFIX "") + set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas") + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${LLAMA_EXTRA_LIBS}) + target_compile_features(${TARGET} PRIVATE cxx_std_11) +endif() -set(TARGET koboldcpp_cublas) -add_library(${TARGET} SHARED expose.cpp expose.h) -target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples) -target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump -set_target_properties(${TARGET} PROPERTIES PREFIX "") -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas") -set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) -target_link_libraries(${TARGET} PUBLIC ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +if (LLAMA_HIPBLAS) + set(TARGET koboldcpp_hipblas) + add_library(${TARGET} SHARED expose.cpp expose.h) + target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples ./common) + target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump + set_target_properties(${TARGET} PROPERTIES PREFIX "") + set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas") + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${LLAMA_EXTRA_LIBS}) + target_compile_features(${TARGET} PRIVATE cxx_std_11) +endif() if (MAKE_MISC_FILES) +add_subdirectory(common) add_library(llama llama.cpp llama.h - llama-util.h ) target_include_directories(llama PUBLIC .) target_compile_features(llama PUBLIC cxx_std_11) # don't bump diff --git a/Dockerfile b/Dockerfile index ea079c284cef81a8a85f08fe2b5c6a8fb2281624..b404bab4bf11089cb7d26ad66ad9d29a9384fb24 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,7 @@ RUN apt update \ && apt install build-essential wget libopenblas-dev make -y \ && make LLAMA_OPENBLAS=1 \ && wget https://huggingface.co/notstoic/pygmalion-13b-ggml/resolve/main/pygmalion-13b-ggml-q4_0.bin \ - && apt remove build-essential wget make -y + && apt remove build-essential wget make -y \ + && rm -fr *.bat convert-* ci docs examples otherarchs tests ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-13b-ggml-q4_0.bin", "--port", "7860"] \ No newline at end of file diff --git a/MIT_LICENSE_GGML_LLAMACPP_ONLY b/MIT_LICENSE_GGML_LLAMACPP_ONLY index 252a81b36bafd202c4e908dab6ba184709ab557e..22fd48829d6b9e1c0a3ab9c47de975b3cd4dded3 100644 --- a/MIT_LICENSE_GGML_LLAMACPP_ONLY +++ b/MIT_LICENSE_GGML_LLAMACPP_ONLY @@ -23,4 +23,4 @@ SOFTWARE. 
=================================== Note that the above license applies ONLY to the GGML library and llama.cpp by ggerganov which are licensed under the MIT License -Kobold Lite by Concedo and the provided python ctypes bindings in koboldcpp.dll are licensed under the AGPL v3.0 License \ No newline at end of file +Kobold Lite by Concedo and the provided python ctypes bindings in koboldcpp dlls are licensed under the AGPL v3.0 License \ No newline at end of file diff --git a/Makefile b/Makefile index 8b0693dd303802f06ae45d5a9b8dece8d37842fe..440ff611279f4213ae3f555563a423e980ee42c6 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas +default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas koboldcpp_hipblas tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt dev: koboldcpp_openblas dev2: koboldcpp_clblast @@ -20,8 +20,6 @@ ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/nul ARCH_ADD = -lcblas endif -CCV := $(shell $(CC) --version | head -n 1) -CXXV := $(shell $(CXX) --version | head -n 1) # Mac OS + Arm can report x86_64 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 @@ -41,8 +39,8 @@ endif # # keep standard at C11 and C++11 -CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS -CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS +CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE +CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE LDFLAGS = # these are used on windows, to build some libraries with extra old device compatibility @@ -110,7 +108,8 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) # old library NEEDS mf16c to work. so we must build with it. 
new one doesnt ifeq ($(OS),Windows_NT) CFLAGS += - NONECFLAGS += -mno-sse3 + NONECFLAGS += +# -mno-sse3 SIMPLECFLAGS += -mavx -msse3 FULLCFLAGS += -mavx2 -msse3 -mfma -mf16c -mavx else @@ -195,6 +194,42 @@ ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-l $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ endif # LLAMA_CUBLAS +ifdef LLAMA_HIPBLAS + ROCM_PATH ?= /opt/rocm + CC := $(ROCM_PATH)/llvm/bin/clang + CXX := $(ROCM_PATH)/llvm/bin/clang++ + GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) + LLAMA_CUDA_DMMV_X ?= 32 + LLAMA_CUDA_MMV_Y ?= 2 + LLAMA_CUDA_KQUANTS_ITER ?= 2 + HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C) +ifdef LLAMA_CUDA_FORCE_DMMV + HIPFLAGS += -DGGML_CUDA_FORCE_DMMV +endif # LLAMA_CUDA_FORCE_DMMV + HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas + HIP_OBJS += ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o +ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ + -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ + -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \ + -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) +ggml_v2-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ + -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ + -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \ + -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) +ggml_v2-cuda-legacy.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \ + -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \ + -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \ + -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) +ggml-cuda.o: ggml-cuda.cu ggml-cuda.h + $(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< +ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h + $(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< +ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h + $(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< +endif # LLAMA_HIPBLAS + + + ifdef LLAMA_METAL CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG CXXFLAGS += -DGGML_USE_METAL @@ -224,12 +259,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif +CCV := $(shell $(CC) --version | head -n 1) +CXXV := $(shell $(CXX) --version | head -n 1) + DEFAULT_BUILD = FAILSAFE_BUILD = OPENBLAS_BUILD = NOAVX2_BUILD = CLBLAST_BUILD = CUBLAS_BUILD = +HIPBLAS_BUILD = ifeq ($(OS),Windows_NT) DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS) @@ -238,10 +277,12 @@ ifeq ($(OS),Windows_NT) NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS) CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS) -ifdef LLAMA_CUBLAS - CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS) -endif - + ifdef LLAMA_CUBLAS + CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS) + endif + ifdef LLAMA_HIPBLAS + HIPBLAS_BUILD = $(CXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.dll $(HIPLDFLAGS) $(LDFLAGS) + endif else DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) @@ -250,24 +291,29 @@ else NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) endif ifdef LLAMA_CLBLAST - ifeq ($(UNAME_S),Darwin) - CLBLAST_BUILD = $(CXX) 
$(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) - else - CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) - endif + ifeq ($(UNAME_S),Darwin) + CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) + else + CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) + endif endif -ifdef LLAMA_CUBLAS - CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS) -endif + ifdef LLAMA_CUBLAS + CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS) + endif + ifdef LLAMA_HIPBLAS + HIPBLAS_BUILD = $(CXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.so $(HIPLDFLAGS) $(LDFLAGS) + endif ifndef LLAMA_OPENBLAS ifndef LLAMA_CLBLAST ifndef LLAMA_CUBLAS + ifndef LLAMA_HIPBLAS OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.' endif endif endif + endif endif @@ -293,16 +339,16 @@ $(info ) ggml.o: ggml.c ggml.h ggml-cuda.h k_quants.h $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ -ggml_openblas.o: ggml.c ggml.h +ggml_openblas.o: ggml.c ggml.h ggml-cuda.h k_quants.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ -ggml_failsafe.o: ggml.c ggml.h +ggml_failsafe.o: ggml.c ggml.h ggml-cuda.h k_quants.h $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ -ggml_noavx2.o: ggml.c ggml.h +ggml_noavx2.o: ggml.c ggml.h ggml-cuda.h k_quants.h $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ -ggml_clblast.o: ggml.c ggml.h +ggml_clblast.o: ggml.c ggml.h ggml-cuda.h k_quants.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ -ggml_cublas.o: ggml.c ggml.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@ +ggml_cublas.o: ggml.c ggml.h ggml-cuda.h k_quants.h + $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ #quants K k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h @@ -328,7 +374,7 @@ ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@ + $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ #extreme old version compat ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h @@ -345,19 +391,19 @@ ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-ope $(CC) $(CFLAGS) -c $< -o $@ # intermediate objects -llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h +llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h $(CXX) $(CXXFLAGS) -c $< -o $@ -common.o: examples/common.cpp examples/common.h +common.o: common/common.cpp common/common.h common/log.h $(CXX) $(CXXFLAGS) -c $< -o $@ -console.o: examples/console.cpp examples/console.h +console.o: common/console.cpp common/console.h $(CXX) $(CXXFLAGS) -c $< -o $@ -grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h +grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h $(CXX) $(CXXFLAGS) -c $< -o $@ expose.o: expose.cpp expose.h $(CXX) $(CXXFLAGS) -c $< -o $@ # idiotic 
"for easier compilation" -GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp llama.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml.h ggml-cuda.h llama.h llama-util.h +GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp llama.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml.h ggml-cuda.h llama.h otherarch/llama-util.h gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER) $(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@ gpttype_adapter.o: $(GPTTYPE_ADAPTER) @@ -365,10 +411,10 @@ gpttype_adapter.o: $(GPTTYPE_ADAPTER) gpttype_adapter_clblast.o: $(GPTTYPE_ADAPTER) $(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER) - $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@ + $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ clean: - rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so + rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @@ -376,19 +422,24 @@ main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o @echo '==== Run ./main -h for help. 
====' @echo +gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + #generated libraries -koboldcpp: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o $(OBJS) +koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS) $(DEFAULT_BUILD) -koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o $(OBJS) +koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS) $(OPENBLAS_BUILD) -koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o $(OBJS) +koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o grammar-parser.o $(OBJS) $(FAILSAFE_BUILD) -koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o $(OBJS) +koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o grammar-parser.o $(OBJS) $(NOAVX2_BUILD) -koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o $(OBJS) +koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS) $(CLBLAST_BUILD) -koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(OBJS) +koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS) $(CUBLAS_BUILD) +koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS) + $(HIPBLAS_BUILD) quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) diff --git a/Package.swift b/Package.swift index 73d027c70215495a5b5233b95110a2c3f622d46c..fb95ef7ebc59f05ffa63c48e74eefd36f098fe65 100644 --- a/Package.swift +++ b/Package.swift @@ -2,8 +2,30 @@ import PackageDescription +#if arch(arm) || arch(arm64) +let platforms: [SupportedPlatform]? = [ + .macOS(.v11), + .iOS(.v14), + .watchOS(.v4), + .tvOS(.v14) +] +let exclude: [String] = [] +let additionalSources: [String] = ["ggml-metal.m"] +let additionalSettings: [CSetting] = [ + .unsafeFlags(["-fno-objc-arc"]), + .define("GGML_SWIFT"), + .define("GGML_USE_METAL") +] +#else +let platforms: [SupportedPlatform]? 
= nil +let exclude: [String] = ["ggml-metal.metal"] +let additionalSources: [String] = [] +let additionalSettings: [CSetting] = [] +#endif + let package = Package( name: "llama", + platforms: platforms, products: [ .library(name: "llama", targets: ["llama"]), ], @@ -11,14 +33,23 @@ let package = Package( .target( name: "llama", path: ".", - exclude: ["ggml-metal.metal"], - sources: ["ggml.c", "llama.cpp"], + exclude: exclude, + sources: [ + "ggml.c", + "llama.cpp", + "ggml-alloc.c", + "k_quants.c", + ] + additionalSources, publicHeadersPath: "spm-headers", - cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")], + cSettings: [ + .unsafeFlags(["-Wno-shorten-64-to-32"]), + .define("GGML_USE_K_QUANTS"), + .define("GGML_USE_ACCELERATE") + ] + additionalSettings, linkerSettings: [ .linkedFramework("Accelerate") ] - ), + ) ], cxxLanguageStandard: .cxx11 ) diff --git a/README.md b/README.md index e1ca814f99b674cfe002deb9621dd8a9ab1421b3..77ffe1d11702f82883809a3e0f3a20082140398f 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,4 @@ sdk: docker emoji: 🚀 colorFrom: yellow colorTo: blue ---- +--- \ No newline at end of file diff --git a/build-info.h b/build-info.h index 70a4db739b776892555e4e7d2b245986b43298b8..6428ea9753921c461b9c31aa5693a79579a37cfe 100644 --- a/build-info.h +++ b/build-info.h @@ -3,5 +3,7 @@ #define BUILD_NUMBER 999 #define BUILD_COMMIT "KOBOLDCPP" +#define BUILD_COMPILER "KCPP" +#define BUILD_TARGET "KCPP" #endif // BUILD_INFO_H diff --git a/ci/run.sh b/ci/run.sh index 8dc3949648ff9620b1ca86ec8e4888649c618227..942b2e00cec4b76befe28909174565fa8b69c941 100644 --- a/ci/run.sh +++ b/ci/run.sh @@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 { python3 ../convert.py ${path_models} - model_f16="${path_models}/ggml-model-f16.bin" - model_q8_0="${path_models}/ggml-model-q8_0.bin" - model_q4_0="${path_models}/ggml-model-q4_0.bin" - model_q4_1="${path_models}/ggml-model-q4_1.bin" - model_q5_0="${path_models}/ggml-model-q5_0.bin" - model_q5_1="${path_models}/ggml-model-q5_1.bin" - model_q2_k="${path_models}/ggml-model-q2_k.bin" - model_q3_k="${path_models}/ggml-model-q3_k.bin" - model_q4_k="${path_models}/ggml-model-q4_k.bin" - model_q5_k="${path_models}/ggml-model-q5_k.bin" - model_q6_k="${path_models}/ggml-model-q6_k.bin" + model_f16="${path_models}/ggml-model-f16.gguf" + model_q8_0="${path_models}/ggml-model-q8_0.gguf" + model_q4_0="${path_models}/ggml-model-q4_0.gguf" + model_q4_1="${path_models}/ggml-model-q4_1.gguf" + model_q5_0="${path_models}/ggml-model-q5_0.gguf" + model_q5_1="${path_models}/ggml-model-q5_1.gguf" + model_q2_k="${path_models}/ggml-model-q2_k.gguf" + model_q3_k="${path_models}/ggml-model-q3_k.gguf" + model_q4_k="${path_models}/ggml-model-q4_k.gguf" + model_q5_k="${path_models}/ggml-model-q5_k.gguf" + model_q6_k="${path_models}/ggml-model-q6_k.gguf" wiki_test_60="${path_wiki}/wiki.test-60.raw" @@ -196,17 +196,17 @@ function gg_run_open_llama_3b_v2 { (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f 
${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log function check_ppl { qnt="$1" @@ -233,6 +233,48 @@ function gg_run_open_llama_3b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + # lora + function compare_ppl { + qnt="$1" + ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + + if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then + printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2" + return 20 + fi + + printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2" + return 0 + } + + path_lora="../models-mnt/open-llama/3B-v2/lora" + path_shakespeare="../models-mnt/shakespeare" + + shakespeare="${path_shakespeare}/shakespeare.txt" + lora_shakespeare="${path_lora}/ggml-adapter-model.bin" + + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json + gg_wget ${path_lora} 
https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin + gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt + + python3 ../convert-lora-to-ggml.py ${path_lora} + + # f16 + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log + compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + # q8_0 + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log + compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + # q8_0 + f16 lora-base + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log + compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + set +e } @@ -242,6 +284,7 @@ function gg_sum_open_llama_3b_v2 { gg_printf 'OpenLLaMA 3B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" @@ -253,6 +296,11 @@ function gg_sum_open_llama_3b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" + gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" + gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" + gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)" + gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" } # open_llama_7b_v2 @@ -285,17 +333,17 @@ function gg_run_open_llama_7b_v2 { python3 ../convert.py ${path_models} - model_f16="${path_models}/ggml-model-f16.bin" - model_q8_0="${path_models}/ggml-model-q8_0.bin" - model_q4_0="${path_models}/ggml-model-q4_0.bin" - model_q4_1="${path_models}/ggml-model-q4_1.bin" - model_q5_0="${path_models}/ggml-model-q5_0.bin" - model_q5_1="${path_models}/ggml-model-q5_1.bin" - model_q2_k="${path_models}/ggml-model-q2_k.bin" - model_q3_k="${path_models}/ggml-model-q3_k.bin" - 
model_q4_k="${path_models}/ggml-model-q4_k.bin" - model_q5_k="${path_models}/ggml-model-q5_k.bin" - model_q6_k="${path_models}/ggml-model-q6_k.bin" + model_f16="${path_models}/ggml-model-f16.gguf" + model_q8_0="${path_models}/ggml-model-q8_0.gguf" + model_q4_0="${path_models}/ggml-model-q4_0.gguf" + model_q4_1="${path_models}/ggml-model-q4_1.gguf" + model_q5_0="${path_models}/ggml-model-q5_0.gguf" + model_q5_1="${path_models}/ggml-model-q5_1.gguf" + model_q2_k="${path_models}/ggml-model-q2_k.gguf" + model_q3_k="${path_models}/ggml-model-q3_k.gguf" + model_q4_k="${path_models}/ggml-model-q4_k.gguf" + model_q5_k="${path_models}/ggml-model-q5_k.gguf" + model_q6_k="${path_models}/ggml-model-q6_k.gguf" wiki_test="${path_wiki}/wiki.test.raw" @@ -310,17 +358,17 @@ function gg_run_open_llama_7b_v2 { ./bin/quantize ${model_f16} ${model_q5_k} q5_k ./bin/quantize ${model_f16} ${model_q6_k} q6_k - (time ./bin/main --model ${model_f16} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/main --model ${model_q8_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/main --model ${model_q4_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/main --model ${model_q4_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/main --model ${model_q5_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/main --model ${model_q5_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/main --model ${model_q2_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/main --model ${model_q3_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/main --model ${model_q4_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/main --model ${model_q5_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/main --model ${model_q6_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a 
$OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

@@ -359,6 +407,48 @@ function gg_run_open_llama_7b_v2 {
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    # lora
+    function compare_ppl {
+        qnt="$1"
+        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
+            printf '  - %s @ %s %s (FAIL: base ppl < lora ppl)\n' "$qnt" "$ppl1" "$ppl2"
+            return 20
+        fi
+
+        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
+        return 0
+    }
+
+    path_lora="../models-mnt/open-llama/7B-v2/lora"
+    path_shakespeare="../models-mnt/shakespeare"
+
+    shakespeare="${path_shakespeare}/shakespeare.txt"
+    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
+
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
+
+    python3 ../convert-lora-to-ggml.py ${path_lora}
+
+    # f16
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # currently not supported by the CUDA backend
+    # q8_0
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0 + f16 lora-base
+    #(time ./bin/perplexity --model 
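+    # For illustration (values invented, not real CI output): each perplexity
+    # log is filtered with grep "^\[1\]" for its first chunk line, e.g.
+    #
+    #   [1]4.2531,[2]4.1887
+    #
+    # and compare_ppl then extracts the last float from each argument:
+    #
+    #   ppl1=$(echo "[1]4.2531" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)  # 4.2531
+    #   ppl2=$(echo "[1]3.9012" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)  # 3.9012
+    #   echo "$ppl1 < $ppl2" | bc                                         # 0 -> OK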
${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log + #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + set +e } @@ -368,6 +458,7 @@ function gg_sum_open_llama_7b_v2 { gg_printf 'OpenLLaMA 7B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" @@ -379,6 +470,11 @@ function gg_sum_open_llama_7b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" + gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" + #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" + #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)" + #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" } ## main @@ -391,6 +487,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then ln -sfn ${mnt_models} ${SRC}/models-mnt python3 -m pip install -r ${SRC}/requirements.txt + python3 -m pip install --editable gguf-py fi ret=0 diff --git a/class.py b/class.py new file mode 100644 index 0000000000000000000000000000000000000000..fda2236a879699df984a160c07737df58ccedfd2 --- /dev/null +++ b/class.py @@ -0,0 +1,313 @@ +## KoboldCpp based GGML Backend by Concedo +## For use as a custom backend in KoboldAI United +## Not intended for general use. + +from __future__ import annotations + +import time, json +import torch +import requests +import numpy as np +from typing import List, Optional, Union +import os +from . 
import koboldcpp + +import utils +from logger import logger +from modeling.inference_model import ( + GenerationResult, + GenerationSettings, + InferenceModel, +) + +model_backend_name = "koboldcpp" #specific instead of ggml +model_backend_type = "ggml" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face) + +kcpp_backend_loaded = False + +class KoboldCppException(Exception): + """To be used for errors on cpp side of KoboldCpp.""" + +class KcppArgsObject: + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + +class model_backend(InferenceModel): + def __init__(self) -> None: + super().__init__() + + def is_valid(self, model_name, model_path, menu_path): + + foundfile = False + try: + files = os.listdir(model_path) + foundfile = len([filename for filename in files if (("ggml" in filename.lower() and ".bin" in filename.lower()) or ".gguf" in filename.lower())])>0 + except: + pass + return foundfile + + def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}): + + self.kcpp_threads = 5 + self.model_name = "GGML_Model" + self.kcpp_ctxsize = 2048 + self.kcpp_blasbatchsize = 512 + self.kcpp_gpulayers = 0 + self.kcpp_smartcontext = False + self.kcpp_ropescale = 0.0 + self.kcpp_ropebase = 10000.0 + self.kcpp_useclblast = None + self.kcpp_usecublas = None + self.kcpp_noblas = False + self.kcpp_noavx2 = False + self.kcpp_nommap = False + self.kcpp_debugmode = 0 + self.kcpp_tensor_split_str = "" + self.kcpp_tensor_split = None + + files = os.listdir(model_path) + foundfiles = [filename for filename in files if (("ggml" in filename.lower() and ".bin" in filename.lower()) or ".gguf" in filename.lower())] + + requested_parameters = [] + foldermdls = [] + for ff in foundfiles: + foldermdls.append({'text': ff, 'value': os.path.join(model_path, ff)}) + requested_parameters.append({ + "uitype": "dropdown", + "unit": "string", + "label": "GGML DataFile Name", + "id": "kcpp_filename", + "default": os.path.join(model_path, foundfiles[0]) if len(foundfiles)>0 else model_name, + "check": {"value": "", 'check': "!="}, + "tooltip": "Actual GGML DataFile Name", + "menu_path": "", + "refresh_model_inputs": False, + "extra_classes": "", + 'children': foldermdls + }) + requested_parameters.append({ + "uitype": "dropdown", + "unit": "int", + "label": "KoboldCpp Accelerator", + "id": "kcpp_accelerator", + "default": 0, + "check": {"value": "", 'check': "!="}, + 'multiple': False, + "tooltip": "KoboldCpp Accelerator", + "menu_path": "", + "refresh_model_inputs": False, + "extra_classes": "", + 'children': [{'text': 'Use No BLAS', 'value': 0}, {'text': 'Use OpenBLAS', 'value': 1}, {'text': 'Use CuBLAS', 'value': 2}, + {'text': 'Use CLBLast GPU #1', 'value': 3},{'text': 'Use CLBLast GPU #2', 'value': 4},{'text': 'Use CLBLast GPU #3', 'value': 5} + ,{'text': 'NoAVX2 Mode (Old CPU)', 'value': 6},{'text': 'Failsafe Mode (Old CPU)', 'value': 7}], + }) + requested_parameters.append({ + "uitype": "text", + "unit": "int", + "label": "Threads", + "id": "kcpp_threads", + "default": self.kcpp_threads, + "check": {"value": "", 'check': "!="}, + "tooltip": "Thread Count", + "menu_path": "", + "refresh_model_inputs": False, + "extra_classes": "" + }) + + requested_parameters.append({ + "uitype": "text", + "unit": "int", + "label": "Max Context Size", + "id": "kcpp_ctxsize", + "default": self.kcpp_ctxsize, + "check": {"value": "", 'check': "!="}, + "tooltip": "Max Context Size", + "menu_path": "", + "refresh_model_inputs": 
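+# For illustration, the minimal shape of one requested-parameters entry built
+# above (field names follow this file; the concrete values here are invented):
+#
+#   {
+#       "uitype": "text",                # widget type rendered by the UI
+#       "unit": "int",                   # how the UI coerces the submitted value
+#       "label": "Example",              # human-readable name
+#       "id": "kcpp_example",            # key later read in set_input_parameters
+#       "default": 0,
+#       "check": {"value": "", "check": "!="},
+#       "menu_path": "",
+#       "refresh_model_inputs": False,
+#       "extra_classes": "",
+#   }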
False, + "extra_classes": "" + }) + requested_parameters.append({ + "uitype": "text", + "unit": "int", + "label": "BLAS Batch Size", + "id": "kcpp_blasbatchsize", + "default": self.kcpp_blasbatchsize, + "check": {"value": "", 'check': "!="}, + "tooltip": "BLAS Batch Size", + "menu_path": "", + "refresh_model_inputs": False, + "extra_classes": "" + }) + requested_parameters.append({ + "uitype": "text", + "unit": "int", + "label": "GPU Layers", + "id": "kcpp_gpulayers", + "default": self.kcpp_gpulayers, + "check": {"value": "", 'check': "!="}, + "tooltip": "GPU Layers", + "menu_path": "", + "refresh_model_inputs": False, + "extra_classes": "" + }) + requested_parameters.append({ + "uitype": "text", + "unit": "int", + "label": "Rope Scale", + "id": "kcpp_ropescale", + "default": self.kcpp_ropescale, + "check": {"value": "", 'check': "!="}, + "tooltip": "Rope Scale", + "menu_path": "", + "refresh_model_inputs": False, + "extra_classes": "" + }) + requested_parameters.append({ + "uitype": "text", + "unit": "int", + "label": "Rope Base", + "id": "kcpp_ropebase", + "default": self.kcpp_ropebase, + "check": {"value": "", 'check': "!="}, + "tooltip": "Rope Base", + "menu_path": "", + "refresh_model_inputs": False, + "extra_classes": "" + }) + requested_parameters.append({ + "uitype": "dropdown", + "unit": "int", + "label": "Smart Context", + "id": "kcpp_smartcontext", + "default": self.kcpp_smartcontext, + "check": {"value": "", 'check': "!="}, + 'multiple': False, + "tooltip": "Smart Context", + "menu_path": "", + "refresh_model_inputs": False, + "extra_classes": "", + 'children': [{'text': 'False', 'value': False}, {'text': 'True', 'value': True}], + }) + requested_parameters.append({ + "uitype": "dropdown", + "unit": "int", + "label": "Debug Mode", + "id": "kcpp_debugmode", + "default": self.kcpp_debugmode, + "check": {"value": "", 'check': "!="}, + 'multiple': False, + "tooltip": "Debug Mode", + "menu_path": "", + "refresh_model_inputs": False, + "extra_classes": "", + 'children': [{'text': 'False', 'value': 0}, {'text': 'True', 'value': 1}], + }) + requested_parameters.append({ + "uitype": "text", + "unit": "text", + "label": "Tensor Split", + "id": "kcpp_tensor_split_str", + "default": self.kcpp_tensor_split_str, + "check": {"value": "", 'check': "!="}, + "tooltip": "Tensor Split, values are space separated", + "menu_path": "", + "refresh_model_inputs": False, + "extra_classes": "" + }) + return requested_parameters + + def set_input_parameters(self, parameters): + self.kcpp_threads = parameters["kcpp_threads"] + self.kcpp_filename = parameters["kcpp_filename"] + self.kcpp_ctxsize = parameters["kcpp_ctxsize"] + self.kcpp_blasbatchsize = parameters["kcpp_blasbatchsize"] + self.kcpp_gpulayers = parameters["kcpp_gpulayers"] + self.kcpp_smartcontext = parameters["kcpp_smartcontext"] + self.kcpp_ropescale = parameters["kcpp_ropescale"] + self.kcpp_ropebase = parameters["kcpp_ropebase"] + self.kcpp_debugmode = parameters["kcpp_debugmode"] + self.kcpp_tensor_split_str = parameters["kcpp_tensor_split_str"] + if self.kcpp_tensor_split_str and self.kcpp_tensor_split_str!="": + splits = self.kcpp_tensor_split_str.split() + self.kcpp_tensor_split = [] + for s in splits: + self.kcpp_tensor_split.append(int(s)) + + accel = parameters["kcpp_accelerator"] + if accel==0: + self.kcpp_noblas = True + elif accel==1: + pass + elif accel==2: + self.kcpp_usecublas = ["normal"] + elif accel==3: + self.kcpp_useclblast = [0,0] + elif accel==4: + self.kcpp_useclblast = [1,0] + elif accel==5: + self.kcpp_useclblast = 
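+# A hypothetical set_input_parameters() call matching the accelerator mapping
+# in this class (every value invented for illustration): accelerator 4 selects
+# CLBlast device [1,0], and "3 1" becomes the tensor split [3, 1].
+#
+#   backend.set_input_parameters({
+#       "kcpp_threads": 8, "kcpp_filename": "/models/model.gguf",
+#       "kcpp_ctxsize": 2048, "kcpp_blasbatchsize": 512, "kcpp_gpulayers": 20,
+#       "kcpp_smartcontext": False, "kcpp_ropescale": 0.0,
+#       "kcpp_ropebase": 10000.0, "kcpp_debugmode": 0,
+#       "kcpp_tensor_split_str": "3 1", "kcpp_accelerator": 4,
+#   })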
[0,1]
+        elif accel==6:
+            self.kcpp_noavx2 = True
+        elif accel==7:
+            self.kcpp_noavx2 = True
+            self.kcpp_noblas = True
+            self.kcpp_nommap = True
+        pass
+
+    def unload(self):
+        print("Attempting to unload library")
+        koboldcpp.unload_libs()
+        global kcpp_backend_loaded
+        kcpp_backend_loaded = False
+        pass
+
+    def _load(self, save_model: bool, initial_load: bool) -> None:
+        global kcpp_backend_loaded
+        self.tokenizer = self._get_tokenizer("gpt2")
+        if not kcpp_backend_loaded:
+            kcppargs = KcppArgsObject(model=self.kcpp_filename, model_param=self.kcpp_filename,
+            port=5001, port_param=5001, host='', launch=False, lora=None, threads=self.kcpp_threads, blasthreads=self.kcpp_threads,
+            psutil_set_threads=False, highpriority=False, contextsize=self.kcpp_ctxsize,
+            blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase], stream=False, smartcontext=self.kcpp_smartcontext,
+            unbantokens=False, bantokens=None, usemirostat=None, forceversion=0, nommap=self.kcpp_nommap,
+            usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas,
+            useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None, onready='', multiuser=False)
+
+            koboldcpp.main(kcppargs,False) #initialize library without enabling Lite http server
+            kcpp_backend_loaded = True
+        pass
+
+    def _save_settings(self):
+        pass
+
+    def _raw_generate(
+        self,
+        prompt_tokens: Union[List[int], torch.Tensor],
+        max_new: int,
+        gen_settings: GenerationSettings,
+        single_line: bool = False,
+        batch_count: int = 1,
+        seed: Optional[int] = None,
+        **kwargs,
+    ) -> GenerationResult:
+
+        decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))
+
+        # Store context in memory to use it for comparison with generated content
+        utils.koboldai_vars.lastctx = decoded_prompt
+
+        genresult = koboldcpp.generate(decoded_prompt,max_new,utils.koboldai_vars.max_length,
+        gen_settings.temp,int(gen_settings.top_k),gen_settings.top_a,gen_settings.top_p,
+        gen_settings.typical,gen_settings.tfs,gen_settings.rep_pen,gen_settings.rep_pen_range,
+        sampler_order=gen_settings.sampler_order,use_default_badwordsids=utils.koboldai_vars.use_default_badwordsids)
+
+        outputs = [genresult]
+        return GenerationResult(
+            model=self,
+            out_batches=np.array(
+                [self.tokenizer.encode(x) for x in outputs]
+            ),
+            prompt=prompt_tokens,
+            is_whole_generation=True,
+            single_line=single_line,
+        )
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a301c5b2c769437b20d78d126e335a586a27eff6
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,14 @@
+comment: off
+
+coverage:
+  status:
+    project:
+      default:
+        target: auto
+        threshold: 0
+        base: auto
+    patch:
+      default:
+        target: auto
+        threshold: 0
+        base: auto
diff --git a/colab.ipynb b/colab.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..2ebdfc9b58c212399af814a5097f5deec27de9a5
--- /dev/null
+++ b/colab.ipynb
@@ -0,0 +1,61 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "private_outputs": true,
+      "provenance": [],
+      "gpuType": "T4",
+      "authorship_tag": "ABX9TyOv14c2MWENhO6RJ3uy6vD7",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
"colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "uJS9i_Dltv8Y" + }, + "outputs": [], + "source": [ + "#@title v-- Enter your model below and then click this to start Koboldcpp\n", + "\n", + "Model = \"https://huggingface.co/TheBloke/airoboros-l2-13B-gpt4-1.4.1-GGML/resolve/main/airoboros-l2-13b-gpt4-1.4.1.ggmlv3.q4_0.bin\" #@param [\"\"]{allow-input: true}\n", + "Layers = 43 #@param [43]{allow-input: true}\n", + "\n", + "%cd /content\n", + "!git clone https://github.com/LostRuins/koboldcpp\n", + "%cd /content/koboldcpp\n", + "!make LLAMA_CUBLAS=1\n", + "\n", + "!wget $Model -O model.ggml\n", + "!wget -c https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64\n", + "!chmod +x cloudflared-linux-amd64\n", + "!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &\n", + "!sleep 10\n", + "!cat nohup.out\n", + "!python koboldcpp.py model.ggml --stream --usecublas 0 --gpulayers $Layers --hordeconfig concedo\n" + ] + } + ] +} \ No newline at end of file diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..dead56118bac828008c0c1e3261b31c1b3d4666a --- /dev/null +++ b/common/CMakeLists.txt @@ -0,0 +1,20 @@ +# common + +set(TARGET common) + +add_library(${TARGET} OBJECT + common.h + common.cpp + console.h + console.cpp + grammar-parser.h + grammar-parser.cpp + ) + +if (BUILD_SHARED_LIBS) + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + +target_include_directories(${TARGET} PUBLIC .) +target_compile_features(${TARGET} PUBLIC cxx_std_11) +target_link_libraries(${TARGET} PRIVATE llama) diff --git a/common/common.cpp b/common/common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2597ba06aee16584c3591250effc438b743b895b --- /dev/null +++ b/common/common.cpp @@ -0,0 +1,1270 @@ +#include "common.h" +#include "build-info.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__APPLE__) && defined(__MACH__) +#include +#include +#endif + +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#include +#include +#include +#include +#else +#include +#include +#include +#endif + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +int32_t get_num_physical_cores() { +#ifdef __linux__ + // enumerate the set of thread siblings, num entries is num cores + std::unordered_set siblings; + for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { + std::ifstream thread_siblings("/sys/devices/system/cpu" + + std::to_string(cpu) + "/topology/thread_siblings"); + if (!thread_siblings.is_open()) { + break; // no more cpus + } + std::string line; + if (std::getline(thread_siblings, line)) { + siblings.insert(line); + } + } + if (!siblings.empty()) { + return static_cast(siblings.size()); + } +#elif defined(__APPLE__) && defined(__MACH__) + int32_t num_physical_cores; + size_t len = sizeof(num_physical_cores); + int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0); + if (result == 0) { + return num_physical_cores; + } + result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0); + if (result == 0) { + return num_physical_cores; + } +#elif defined(_WIN32) + //TODO: Implement 
+#endif + unsigned int n_threads = std::thread::hardware_concurrency(); + return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; +} + +static void process_escapes(std::string& input) { + std::size_t input_len = input.length(); + std::size_t output_idx = 0; + + for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) { + if (input[input_idx] == '\\' && input_idx + 1 < input_len) { + switch (input[++input_idx]) { + case 'n': input[output_idx++] = '\n'; break; + case 'r': input[output_idx++] = '\r'; break; + case 't': input[output_idx++] = '\t'; break; + case '\'': input[output_idx++] = '\''; break; + case '\"': input[output_idx++] = '\"'; break; + case '\\': input[output_idx++] = '\\'; break; + default: input[output_idx++] = '\\'; + input[output_idx++] = input[input_idx]; break; + } + } else { + input[output_idx++] = input[input_idx]; + } + } + + input.resize(output_idx); +} + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + bool invalid_param = false; + std::string arg; + gpt_params default_params; + const std::string arg_prefix = "--"; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.seed = std::stoul(argv[i]); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + if (params.n_threads <= 0) { + params.n_threads = std::thread::hardware_concurrency(); + } + } else if (arg == "-p" || arg == "--prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.prompt = argv[i]; + } else if (arg == "-e" || arg == "--escape") { + params.escape = true; + } else if (arg == "--prompt-cache") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.path_prompt_cache = argv[i]; + } else if (arg == "--prompt-cache-all") { + params.prompt_cache_all = true; + } else if (arg == "--prompt-cache-ro") { + params.prompt_cache_ro = true; + } else if (arg == "-f" || arg == "--file") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } else if (arg == "-n" || arg == "--n-predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); + } else if (arg == "--top-k") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_k = std::stoi(argv[i]); + } else if (arg == "-c" || arg == "--ctx-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); + } else if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_base = std::stof(argv[i]); + } else if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_scale = 1.0f/std::stof(argv[i]); + } else if (arg == "--memory-f32") { + params.memory_f16 = false; + } else if (arg == "--top-p") { + if (++i >= 
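+        // Worked example for process_escapes above (string invented): the raw
+        // argument "Hi\\nthere" (9 chars: H i \ n t h e r e) is rewritten in
+        // place to "Hi\nthere" (8 chars) and resize() trims the leftover tail:
+        //
+        //   std::string s = "Hi\\nthere";
+        //   process_escapes(s);   // s.length() == 8, s[2] == '\n'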
argc) { + invalid_param = true; + break; + } + params.top_p = std::stof(argv[i]); + } else if (arg == "--temp") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.temp = std::stof(argv[i]); + } else if (arg == "--tfs") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.tfs_z = std::stof(argv[i]); + } else if (arg == "--typical") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.typical_p = std::stof(argv[i]); + } else if (arg == "--repeat-last-n") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_last_n = std::stoi(argv[i]); + } else if (arg == "--repeat-penalty") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_penalty = std::stof(argv[i]); + } else if (arg == "--frequency-penalty") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.frequency_penalty = std::stof(argv[i]); + } else if (arg == "--presence-penalty") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.presence_penalty = std::stof(argv[i]); + } else if (arg == "--mirostat") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mirostat = std::stoi(argv[i]); + } else if (arg == "--mirostat-lr") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mirostat_eta = std::stof(argv[i]); + } else if (arg == "--mirostat-ent") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mirostat_tau = std::stof(argv[i]); + } else if (arg == "--cfg-negative-prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cfg_negative_prompt = argv[i]; + } else if (arg == "--cfg-negative-prompt-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.cfg_negative_prompt)); + if (params.cfg_negative_prompt.back() == '\n') { + params.cfg_negative_prompt.pop_back(); + } + } else if (arg == "--cfg-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cfg_scale = std::stof(argv[i]); + } else if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + } else if (arg == "--keep") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_keep = std::stoi(argv[i]); + } else if (arg == "--draft") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_draft = std::stoi(argv[i]); + } else if (arg == "--chunks") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_chunks = std::stoi(argv[i]); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; + } else if (arg == "-md" || arg == "--model-draft") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_draft = argv[i]; + } else if (arg == "-a" || arg == "--alias") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_alias = argv[i]; + } else if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter = argv[i]; + params.use_mmap = false; + } else if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_base = argv[i]; + } else if (arg == "-i" || arg == "--interactive") { + params.interactive = true; + } else if 
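+        // (the --lora handler above disables mmap because the base tensors
+        // must be modified in place when the adapter is applied; the help
+        // text below notes the same: "implies --no-mmap")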
(arg == "--embedding") { + params.embedding = true; + } else if (arg == "--interactive-first") { + params.interactive_first = true; + } else if (arg == "-ins" || arg == "--instruct") { + params.instruct = true; + } else if (arg == "--multiline-input") { + params.multiline_input = true; + } else if (arg == "--simple-io") { + params.simple_io = true; + } else if (arg == "--color") { + params.use_color = true; + } else if (arg == "--mlock") { + params.use_mlock = true; + } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params.n_gpu_layers = std::stoi(argv[i]); +#else + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); +#endif + } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params.n_gpu_layers_draft = std::stoi(argv[i]); +#else + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); +#endif + } else if (arg == "--main-gpu" || arg == "-mg") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef GGML_USE_CUBLAS + params.main_gpu = std::stoi(argv[i]); +#else + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); +#endif + } else if (arg == "--tensor-split" || arg == "-ts") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef GGML_USE_CUBLAS + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + + for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } else { + params.tensor_split[i] = 0.0f; + } + } +#else + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); +#endif // GGML_USE_CUBLAS + } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { +#ifdef GGML_USE_CUBLAS + params.mul_mat_q = false; +#else + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); +#endif // GGML_USE_CUBLAS + } else if (arg == "--low-vram" || arg == "-lv") { +#ifdef GGML_USE_CUBLAS + params.low_vram = true; +#else + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. 
It is not possible to set lower vram usage.\n"); +#endif // GGML_USE_CUBLAS + } else if (arg == "--no-mmap") { + params.use_mmap = false; + } else if (arg == "--numa") { + params.numa = true; + } else if (arg == "--export") { + params.export_cgraph = true; + } else if (arg == "--verbose-prompt") { + params.verbose_prompt = true; + } else if (arg == "-r" || arg == "--reverse-prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.antiprompt.push_back(argv[i]); + } else if (arg == "-ld" || arg == "--logdir") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.logdir = argv[i]; + + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } + } else if (arg == "--perplexity") { + params.perplexity = true; + } else if (arg == "--ppl-stride") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.ppl_stride = std::stoi(argv[i]); + } else if (arg == "--ppl-output-type") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.ppl_output_type = std::stoi(argv[i]); + } else if (arg == "--hellaswag") { + params.hellaswag = true; + } else if (arg == "--hellaswag-tasks") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.hellaswag_tasks = std::stoi(argv[i]); + } else if (arg == "--ignore-eos") { + params.ignore_eos = true; + } else if (arg == "--no-penalize-nl") { + params.penalize_nl = false; + } else if (arg == "-l" || arg == "--logit-bias") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::stringstream ss(argv[i]); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + } else { + throw std::exception(); + } + } catch (const std::exception&) { + invalid_param = true; + break; + } + } else if (arg == "-h" || arg == "--help") { + gpt_print_usage(argc, argv, default_params); +#ifndef LOG_DISABLE_LOGS + log_print_usage(); +#endif // LOG_DISABLE_LOGS + exit(0); + } else if (arg == "--random-prompt") { + params.random_prompt = true; + } else if (arg == "--in-prefix-bos") { + params.input_prefix_bos = true; + } else if (arg == "--in-prefix") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.input_prefix = argv[i]; + } else if (arg == "--in-suffix") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.input_suffix = argv[i]; + } else if (arg == "--grammar") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.grammar = argv[i]; + } else if (arg == "--grammar-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(params.grammar) + ); +#ifndef LOG_DISABLE_LOGS + // Parse args for logging parameters + } else if ( log_param_single_parse( argv[i] ) ) { + // Do nothing, log_param_single_parse automatically does it's thing + // and returns if a match was found and parsed. + } else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { + // We have a matching known parameter requiring an argument, + // now we need to check if there is anything after this argv + // and flag invalid_param or parse it. 
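+            // For illustration, how the stringstream parse for --logit-bias above
+            // splits its argument (token 15043 is the example used in the help
+            // text below):
+            //
+            //   "-l 15043+1"  ->  key = 15043, sign = '+', value_str = "1"  ->  bias +1.0
+            //   "-l 15043-1"  ->  key = 15043, sign = '-', value_str = "1"  ->  bias -1.0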
+ if (++i >= argc) { + invalid_param = true; + break; + } + if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) { + invalid_param = true; + break; + } + // End of Parse args for logging parameters +#endif // LOG_DISABLE_LOGS + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, default_params); + exit(1); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, default_params); + exit(1); + } + if (params.prompt_cache_all && + (params.interactive || params.interactive_first || + params.instruct)) { + fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n"); + gpt_print_usage(argc, argv, default_params); + exit(1); + } + + if (params.escape) { + process_escapes(params.prompt); + process_escapes(params.input_prefix); + process_escapes(params.input_suffix); + } + + return true; +} + +void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { + printf("usage: %s [options]\n", argv[0]); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -i, --interactive run in interactive mode\n"); + printf(" --interactive-first run in interactive mode and wait for input right away\n"); + printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); + printf(" -r PROMPT, --reverse-prompt PROMPT\n"); + printf(" halt generation at PROMPT, return control in interactive mode\n"); + printf(" (can be specified more than once for multiple prompts).\n"); + printf(" --color colorise output to distinguish prompt and user input from generations\n"); + printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -p PROMPT, --prompt PROMPT\n"); + printf(" prompt to start generation with (default: empty)\n"); + printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); + printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); + printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); + printf(" not supported with --interactive or other interactive options\n"); + printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); + printf(" --random-prompt start with a randomized prompt.\n"); + printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n"); + printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n"); + printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); + printf(" -f FNAME, --file FNAME\n"); + printf(" prompt file to start generation.\n"); + printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); + printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); + printf(" --tfs N 
tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); + printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); + printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); + printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); + printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); + printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); + printf(" --mirostat N use Mirostat sampling.\n"); + printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); + printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); + printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); + printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); + printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); + printf(" modifies the likelihood of token appearing in the completion,\n"); + printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); + printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); + printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n"); + printf(" --grammar-file FNAME file to read grammar from\n"); + printf(" --cfg-negative-prompt PROMPT\n"); + printf(" negative prompt to use for guidance. (default: empty)\n"); + printf(" --cfg-negative-prompt-file FNAME\n"); + printf(" negative prompt file to use for guidance. 
(default: empty)\n"); + printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); + printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n"); + printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n"); + printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); + printf(" --no-penalize-nl do not penalize newline token\n"); + printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); + printf(" --temp N temperature (default: %.1f)\n", (double)params.temp); + printf(" --perplexity compute perplexity over each ctx window of the prompt\n"); + printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); + printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); + printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); + printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); + printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); + if (llama_mlock_supported()) { + printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } + if (llama_mmap_supported()) { + printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + } + printf(" --numa attempt optimizations that help on some NUMA systems\n"); + printf(" if run without this previously, it is recommended to drop the system page cache before using this\n"); + printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n"); +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + printf(" -ngl N, --n-gpu-layers N\n"); + printf(" number of layers to store in VRAM\n"); + printf(" -ngld N, --n-gpu-layers-draft N\n"); + printf(" number of layers to store in VRAM for the draft model\n"); + printf(" -ts SPLIT --tensor-split SPLIT\n"); + printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 
3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); + printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n"); +#ifdef GGML_USE_CUBLAS + printf(" -nommq, --no-mul-mat-q\n"); + printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); + printf(" Not recommended since this is both slower and uses more VRAM.\n"); +#endif // GGML_USE_CUBLAS +#endif + printf(" --export export the computation graph to 'llama.ggml'\n"); + printf(" --verbose-prompt print prompt before generation\n"); + fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); + printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" -m FNAME, --model FNAME\n"); + printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -md FNAME, --model-draft FNAME\n"); + printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str()); + printf(" -ld LOGDIR, --logdir LOGDIR\n"); + printf(" path under which to save YAML logs (no logging if unset)\n"); + printf("\n"); +} + +std::string gpt_random_prompt(std::mt19937 & rng) { + const int r = rng() % 10; + switch (r) { + case 0: return "So"; + case 1: return "Once upon a time"; + case 2: return "When"; + case 3: return "The"; + case 4: return "After"; + case 5: return "If"; + case 6: return "import"; + case 7: return "He"; + case 8: return "She"; + case 9: return "They"; + default: return "To"; + } + + return "The"; +} + +// +// Model utils +// + +struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_batch = params.n_batch; + if (params.n_gpu_layers != -1) { + lparams.n_gpu_layers = params.n_gpu_layers; + } + lparams.main_gpu = params.main_gpu; + lparams.tensor_split = params.tensor_split; + lparams.low_vram = params.low_vram; + lparams.mul_mat_q = params.mul_mat_q; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.use_mmap = params.use_mmap; + lparams.use_mlock = params.use_mlock; + lparams.logits_all = params.perplexity; + lparams.embedding = params.embedding; + lparams.rope_freq_base = params.rope_freq_base; + lparams.rope_freq_scale = params.rope_freq_scale; + + return lparams; +} + +std::tuple llama_init_from_gpt_params(gpt_params & params) { + auto lparams = llama_context_params_from_gpt_params(params); + + llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams); + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return std::make_tuple(nullptr, nullptr); + } + + llama_context * lctx = llama_new_context_with_model(model, lparams); + if (lctx == NULL) { + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + + if (!params.lora_adapter.empty()) { + int err = llama_model_apply_lora_from_file(model, + params.lora_adapter.c_str(), + params.lora_base.empty() ? 
NULL : params.lora_base.c_str(), + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + } + + if (params.ignore_eos) { + params.logit_bias[llama_token_eos(lctx)] = -INFINITY; + } + + { + LOG("warming up the model with an empty run\n"); + + const std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; + llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads); + llama_reset_timings(lctx); + } + + return std::make_tuple(model, lctx); +} + +// +// Vocab utils +// + +std::vector llama_tokenize( + struct llama_context * ctx, + const std::string & text, + bool add_bos) { + // upper limit for the number of tokens + int n_tokens = text.length() + add_bos; + std::vector result(n_tokens); + n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos); + if (n_tokens < 0) { + result.resize(-n_tokens); + int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos); + GGML_ASSERT(check == -n_tokens); + } else { + result.resize(n_tokens); + } + return result; +} + +std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { + std::vector result(8, 0); + const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); + if (n_tokens < 0) { + result.resize(-n_tokens); + int check = llama_token_to_piece(ctx, token, result.data(), result.size()); + GGML_ASSERT(check == -n_tokens); + } else { + result.resize(n_tokens); + } + + return std::string(result.data(), result.size()); +} + +std::string llama_detokenize_spm(llama_context * ctx, const std::vector & tokens) { + const llama_token bos_id = llama_token_bos(ctx); + + std::string piece; + std::string result; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + // remove the leading space of the first non-BOS token + if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') { + piece = piece.substr(1); + } + + result += piece; + } + + return result; +} + +std::string llama_detokenize_bpe(llama_context * ctx, const std::vector & tokens) { + std::string piece; + std::string result; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + result += piece; + } + + return result; +} + +// +// Sampling utils +// + +llama_token llama_sample_token( + struct llama_context * ctx, + struct llama_context * ctx_guidance, + struct llama_grammar * grammar, + const struct gpt_params & params, + const std::vector & last_tokens, + std::vector & candidates, + int idx) { + const int n_ctx = llama_n_ctx(ctx); + const int n_vocab = llama_n_vocab(ctx); + + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
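+    // Note on the two-pass pattern in llama_tokenize / llama_token_to_piece
+    // above: the first C API call may return a negative value, i.e. the
+    // negated number of tokens (or bytes) actually required; the buffer is
+    // resized and the second call must then fill it exactly (GGML_ASSERT).
+    // Illustrative use, assuming a valid llama_context * ctx:
+    //
+    //   std::vector<llama_token> toks = llama_tokenize(ctx, "Hello world", true);
+    //   for (llama_token t : toks) {
+    //       printf("%d -> '%s'\n", t, llama_token_to_piece(ctx, t).c_str());
+    //   }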
n_ctx : params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + const float alpha_presence = params.presence_penalty; + const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; + + llama_token id = 0; + + float * logits = llama_get_logits(ctx) + idx * n_vocab; + + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + candidates.clear(); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array cur_p = { candidates.data(), candidates.size(), false }; + + if (ctx_guidance) { + llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale); + } + + // apply penalties + if (!last_tokens.empty()) { + const float nl_logit = logits[llama_token_nl(ctx)]; + const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx); + + llama_sample_repetition_penalty(ctx, &cur_p, + last_tokens.data() + last_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + llama_sample_frequency_and_presence_penalties(ctx, &cur_p, + last_tokens.data() + last_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + + if (!penalize_nl) { + for (size_t idx = 0; idx < cur_p.size; idx++) { + if (cur_p.data[idx].id == llama_token_nl(ctx)) { + cur_p.data[idx].logit = nl_logit; + break; + } + } + } + } + + if (grammar != NULL) { + llama_sample_grammar(ctx, &cur_p, grammar); + } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &cur_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &cur_p, temp); + id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &cur_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k (ctx, &cur_p, top_k, 1); + llama_sample_tail_free (ctx, &cur_p, tfs_z, 1); + llama_sample_typical (ctx, &cur_p, typical_p, 1); + llama_sample_top_p (ctx, &cur_p, top_p, 1); + llama_sample_temperature(ctx, &cur_p, temp); + + { + const int n_top = 10; + LOG("top %d candidates:\n", n_top); + + for (int i = 0; i < n_top; i++) { + const llama_token id = cur_p.data[i].id; + LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p); + } + } + + id = llama_sample_token(ctx, &cur_p); + + LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str()); + } + } + // printf("`%d`", candidates_p.size); + + if (grammar != NULL) { + llama_grammar_accept_token(ctx, grammar, id); + } + + return id; +} + +// +// YAML utils +// + +// returns true if successful, false otherwise +bool create_directory_with_parents(const std::string & path) { +#ifdef _WIN32 + std::wstring_convert> converter; + std::wstring wpath = converter.from_bytes(path); + + // if the path already exists, check whether it's a directory + const DWORD attributes = GetFileAttributesW(wpath.c_str()); + if ((attributes != 
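+    // Worked example for the sampler chain in llama_sample_token above
+    // (numbers invented): with top_k=40, tfs_z=1.0, typical_p=1.0, top_p=0.95
+    // and temp=0.8, the candidates are first cut to the 40 highest logits,
+    // tail-free and locally-typical filtering are disabled at 1.0, top-p keeps
+    // the smallest set of candidates whose probabilities sum to >= 0.95, the
+    // surviving logits are divided by 0.8, and one token is finally drawn.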
INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) { + return true; + } + + size_t pos_slash = 0; + + // process path from front to back, procedurally creating directories + while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) { + const std::wstring subpath = wpath.substr(0, pos_slash); + const wchar_t * test = subpath.c_str(); + + const bool success = CreateDirectoryW(test, NULL); + if (!success) { + const DWORD error = GetLastError(); + + // if the path already exists, ensure that it's a directory + if (error == ERROR_ALREADY_EXISTS) { + const DWORD attributes = GetFileAttributesW(subpath.c_str()); + if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) { + return false; + } + } else { + return false; + } + } + + pos_slash += 1; + } + + return true; +#else + // if the path already exists, check whether it's a directory + struct stat info; + if (stat(path.c_str(), &info) == 0) { + return S_ISDIR(info.st_mode); + } + + size_t pos_slash = 1; // skip leading slashes for directory creation + + // process path from front to back, procedurally creating directories + while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) { + const std::string subpath = path.substr(0, pos_slash); + struct stat info; + + // if the path already exists, ensure that it's a directory + if (stat(subpath.c_str(), &info) == 0) { + if (!S_ISDIR(info.st_mode)) { + return false; + } + } else { + // create parent directories + const int ret = mkdir(subpath.c_str(), 0755); + if (ret != 0) { + return false; + } + } + + pos_slash += 1; + } + + return true; +#endif // _WIN32 +} + +void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector & data) { + if (data.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + fprintf(stream, "%s: [", prop_name); + for (size_t i = 0; i < data.size() - 1; ++i) { + fprintf(stream, "%e, ", data[i]); + } + fprintf(stream, "%e]\n", data.back()); +} + +void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector & data) { + if (data.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + fprintf(stream, "%s: [", prop_name); + for (size_t i = 0; i < data.size() - 1; ++i) { + fprintf(stream, "%d, ", data[i]); + } + fprintf(stream, "%d]\n", data.back()); +} + +void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) { + std::string data_str(data == NULL ? 
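+    // Usage sketch for create_directory_with_parents above (path invented):
+    // it behaves like `mkdir -p`, but only components *before* a separator are
+    // created, which is why the --logdir handler earlier appends
+    // DIRECTORY_SEPARATOR; hence the trailing slash here:
+    //
+    //   if (!create_directory_with_parents("logs/2023_09_01/")) {
+    //       fprintf(stderr, "failed to create log directory\n");
+    //   }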
"" : data); + + if (data_str.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + size_t pos_start = 0; + size_t pos_found = 0; + + if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) { + data_str = std::regex_replace(data_str, std::regex("\n"), "\\n"); + data_str = std::regex_replace(data_str, std::regex("\""), "\\\""); + data_str = "\"" + data_str + "\""; + fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); + return; + } + + if (data_str.find('\n') == std::string::npos) { + fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); + return; + } + + fprintf(stream, "%s: |\n", prop_name); + while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) { + fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str()); + pos_start = pos_found + 1; + } +} + +std::string get_sortable_timestamp() { + using clock = std::chrono::system_clock; + + const clock::time_point current_time = clock::now(); + const time_t as_time_t = clock::to_time_t(current_time); + char timestamp_no_ns[100]; + std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t)); + + const int64_t ns = std::chrono::duration_cast( + current_time.time_since_epoch() % 1000000000).count(); + char timestamp_ns[11]; + snprintf(timestamp_ns, 11, "%09" PRId64, ns); + + return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); +} + +void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx, + const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { + fprintf(stream, "build_commit: %s\n", BUILD_COMMIT); + fprintf(stream, "build_number: %d\n", BUILD_NUMBER); + fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); + fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); + fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); + fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); + fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); + fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); + fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); + fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? 
"true" : "false"); + +#ifdef NDEBUG + fprintf(stream, "debug: false\n"); +#else + fprintf(stream, "debug: true\n"); +#endif // NDEBUG + + fprintf(stream, "model_desc: %s\n", model_desc); + fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx)); + +#ifdef __OPTIMIZE__ + fprintf(stream, "optimize: true\n"); +#else + fprintf(stream, "optimize: false\n"); +#endif // __OPTIMIZE__ + + fprintf(stream, "time: %s\n", timestamp.c_str()); + + fprintf(stream, "\n"); + fprintf(stream, "###############\n"); + fprintf(stream, "# User Inputs #\n"); + fprintf(stream, "###############\n"); + fprintf(stream, "\n"); + + fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); + fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); + dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str()); + fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale); + fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); + fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); + fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); + fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); + fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false"); + fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); + fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty); + dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); + fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); + fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); + fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); + + const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx)); + const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY; + fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); + + dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str()); + fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); + dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str()); + fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false"); + fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); + fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); + fprintf(stream, "keep: %d # default: 0\n", params.n_keep); + fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); + + fprintf(stream, "logit_bias:\n"); + for (std::pair lb : params.logit_bias) { + if (ignore_eos && lb.first == logit_bias_eos->first) { + continue; + } + fprintf(stream, " %d: %f", lb.first, lb.second); + } + + fprintf(stream, "lora: %s\n", params.lora_adapter.c_str()); + fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); + fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false"); + fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); + fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? 
"true" : "false"); + fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat); + fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau); + fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta); + fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); + fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str()); + fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); + fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); + fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); + fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); + fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs); + fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); + fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false"); + fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false"); + fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false"); + fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); + fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); + fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty); + dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str()); + fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); + fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); + fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); + dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens); + fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false"); + fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty); + + fprintf(stream, "reverse_prompt:\n"); + for (std::string ap : params.antiprompt) { + size_t pos = 0; + while ((pos = ap.find('\n', pos)) != std::string::npos) { + ap.replace(pos, 1, "\\n"); + pos += 1; + } + + fprintf(stream, " - %s\n", ap.c_str()); + } + + fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); + fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); + fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed); + fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); + fprintf(stream, "temp: %f # default: 0.8\n", params.temp); + + const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); + dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); + + fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z); + fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "top_k: %d # default: 40\n", params.top_k); + fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p); + fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p); + fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? 
"true" : "false"); +} diff --git a/common/common.h b/common/common.h new file mode 100644 index 0000000000000000000000000000000000000000..18aea38cee28c19e960e5da00b32e550465f4105 --- /dev/null +++ b/common/common.h @@ -0,0 +1,211 @@ +// Various helper functions and utilities + +#pragma once + +#include "llama.h" + +#define LOG_NO_FILE_LINE_FUNCTION +#include "log.h" + +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#define DIRECTORY_SEPARATOR '\\' +#else +#define DIRECTORY_SEPARATOR '/' +#endif // _WIN32 + +#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) +#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) + +#define print_build_info() do { \ + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \ + fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \ +} while(0) + +// +// CLI argument parsing +// +int32_t get_num_physical_cores(); + +struct gpt_params { + uint32_t seed = -1; // RNG seed + int32_t n_threads = get_num_physical_cores(); + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 512; // context size + int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_draft = 16; // number of tokens to draft during speculative decoding + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t n_beams = 0; // if non-zero then use beam search of given width. 
+ float rope_freq_base = 10000.0f; // RoPE base frequency + float rope_freq_scale = 1.0f; // RoPE frequency scaling factor + + // sampling parameters + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // 1.0 = disabled + float repeat_penalty = 1.10f; // 1.0 = disabled + int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float frequency_penalty = 0.00f; // 0.0 = disabled + float presence_penalty = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + + std::unordered_map logit_bias; // logit bias for specific tokens + + // Classifier-Free Guidance + // https://arxiv.org/abs/2306.17806 + std::string cfg_negative_prompt; // string to help guidance + float cfg_scale = 1.f; // How strong is guidance + + std::string model = "models/7B/ggml-model-f16.gguf"; // model path + std::string model_draft = ""; // draft model for speculative decoding + std::string model_alias = "unknown"; // model alias + std::string prompt = ""; + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state + std::string input_prefix = ""; // string to prefix user inputs with + std::string input_suffix = ""; // string to suffix user inputs with + std::string grammar = ""; // optional BNF-like grammar to constrain sampling + std::vector antiprompt; // string upon seeing which more user input is prompted + std::string logdir = ""; // directory in which to save YAML log files + + std::string lora_adapter = ""; // lora adapter path + std::string lora_base = ""; // base model path for the lora adapter + + int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. 
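+    // For example (hypothetical numbers, assuming the stride semantics implied
+    // by the comment above): with n_ctx = 512 and ppl_stride = 256, perplexity
+    // would be computed over overlapping 512-token windows that advance 256
+    // tokens at a time, instead of over disjoint n_ctx-sized chunks.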
+ int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line + // (which is more convenient to use for plotting) + // + bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt + size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score + + bool low_vram = false; // if true, reduce VRAM usage at the cost of performance + bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS + bool memory_f16 = true; // use f16 instead of f32 for memory kv + bool random_prompt = false; // do not randomize prompt if none provided + bool use_color = false; // use color to distinguish generations and inputs + bool interactive = false; // interactive mode + bool prompt_cache_all = false; // save user input and generations to prompt cache + bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it + + bool embedding = false; // get only sentence embedding + bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\" + bool interactive_first = false; // wait for user input immediately + bool multiline_input = false; // reverse the usage of `\` + bool simple_io = false; // improves compatibility with subprocesses and limited consoles + + bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix + bool ignore_eos = false; // ignore generated EOS tokens + bool instruct = false; // instruction mode (used for Alpaca models) + bool penalize_nl = true; // consider newlines as a repeatable token + bool perplexity = false; // compute perplexity over the prompt + bool use_mmap = true; // use mmap for faster loads + bool use_mlock = false; // use mlock to keep model in memory + bool numa = false; // attempt optimizations that help on some NUMA systems + bool export_cgraph = false; // export the computation graph + bool verbose_prompt = false; // print prompt tokens before generation +}; + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params); + +void gpt_print_usage(int argc, char ** argv, const gpt_params & params); + +std::string gpt_random_prompt(std::mt19937 & rng); + +// +// Model utils +// + +std::tuple llama_init_from_gpt_params(gpt_params & params); +struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); + +// +// Vocab utils +// + +// tokenizes a string into a vector of tokens +// should work similar to Python's `tokenizer.encode` +std::vector llama_tokenize( + struct llama_context * ctx, + const std::string & text, + bool add_bos); + +// tokenizes a token into a piece +// should work similar to Python's `tokenizer.id_to_piece` +std::string llama_token_to_piece( + const struct llama_context * ctx, + llama_token token); + +// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function +// that takes into account the tokenizer type and decides how to handle the leading space +// +// detokenizes a vector of tokens into a string +// should work similar to Python's `tokenizer.decode` +// removes the leading space from the first non-BOS token +std::string llama_detokenize_spm( + llama_context * ctx, + const std::vector & tokens); + +// detokenizes a vector of tokens into a string +// should work similar to Python's `tokenizer.decode` +std::string llama_detokenize_bpe( + llama_context * ctx, + const std::vector & tokens); + +// +// Sampling utils +// + +// this is a common sampling function used across the examples for 
convenience +// it can serve as a starting point for implementing your own sampling function +// +// required: +// - ctx: context to use for sampling +// - params: sampling parameters +// +// optional: +// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL +// - grammar: grammar to use for sampling, ignore if NULL +// - last_tokens: needed for repetition penalty, ignore if empty +// - idx: sample from llama_get_logits(ctx) + idx * n_vocab +// +// returns: +// - token: sampled token +// - candidates: vector of candidate tokens +// +llama_token llama_sample_token( + struct llama_context * ctx, + struct llama_context * ctx_guidance, + struct llama_grammar * grammar, + const struct gpt_params & params, + const std::vector & last_tokens, + std::vector & candidates, + int idx = 0); + +// +// YAML utils +// + +bool create_directory_with_parents(const std::string & path); +void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector & data); +void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector & data); +void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data); +std::string get_sortable_timestamp(); + +void dump_non_result_info_yaml( + FILE * stream, const gpt_params & params, const llama_context * lctx, + const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); diff --git a/common/console.cpp b/common/console.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f65cbc6eda0b1d1e4f45ab976fb8868be33b6c79 --- /dev/null +++ b/common/console.cpp @@ -0,0 +1,501 @@ +#include "console.h" +#include +#include + +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#include +#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING +#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004 +#endif +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_YELLOW "\x1b[33m" +#define ANSI_COLOR_BLUE "\x1b[34m" +#define ANSI_COLOR_MAGENTA "\x1b[35m" +#define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_RESET "\x1b[0m" +#define ANSI_BOLD "\x1b[1m" + +namespace console { + + // + // Console state + // + + static bool advanced_display = false; + static bool simple_io = true; + static display_t current_display = reset; + + static FILE* out = stdout; + +#if defined (_WIN32) + static void* hConsole; +#else + static FILE* tty = nullptr; + static termios initial_state; +#endif + + // + // Init and cleanup + // + + void init(bool use_simple_io, bool use_advanced_display) { + advanced_display = use_advanced_display; + simple_io = use_simple_io; +#if defined(_WIN32) + // Windows-specific console initialization + DWORD dwMode = 0; + hConsole = GetStdHandle(STD_OUTPUT_HANDLE); + if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) { + hConsole = GetStdHandle(STD_ERROR_HANDLE); + if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) { + hConsole = nullptr; + simple_io = true; + } + } + if (hConsole) { + // Check conditions combined to reduce nesting + if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) && + !SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) { + advanced_display = false; + } + // Set console output codepage to UTF8 + SetConsoleOutputCP(CP_UTF8); + } + HANDLE hConIn = 
GetStdHandle(STD_INPUT_HANDLE); + if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) { + // Set console input codepage to UTF16 + _setmode(_fileno(stdin), _O_WTEXT); + + // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT) + if (simple_io) { + dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT; + } else { + dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT); + } + if (!SetConsoleMode(hConIn, dwMode)) { + simple_io = true; + } + } +#else + // POSIX-specific console initialization + if (!simple_io) { + struct termios new_termios; + tcgetattr(STDIN_FILENO, &initial_state); + new_termios = initial_state; + new_termios.c_lflag &= ~(ICANON | ECHO); + new_termios.c_cc[VMIN] = 1; + new_termios.c_cc[VTIME] = 0; + tcsetattr(STDIN_FILENO, TCSANOW, &new_termios); + + tty = fopen("/dev/tty", "w+"); + if (tty != nullptr) { + out = tty; + } + } + + setlocale(LC_ALL, ""); +#endif + } + + void cleanup() { + // Reset console display + set_display(reset); + +#if !defined(_WIN32) + // Restore settings on POSIX systems + if (!simple_io) { + if (tty != nullptr) { + out = stdout; + fclose(tty); + tty = nullptr; + } + tcsetattr(STDIN_FILENO, TCSANOW, &initial_state); + } +#endif + } + + // + // Display and IO + // + + // Keep track of current display and only emit ANSI code if it changes + void set_display(display_t display) { + if (advanced_display && current_display != display) { + fflush(stdout); + switch(display) { + case reset: + fprintf(out, ANSI_COLOR_RESET); + break; + case prompt: + fprintf(out, ANSI_COLOR_YELLOW); + break; + case user_input: + fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN); + break; + case error: + fprintf(out, ANSI_BOLD ANSI_COLOR_RED); + } + current_display = display; + fflush(out); + } + } + + static char32_t getchar32() { +#if defined(_WIN32) + HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE); + wchar_t high_surrogate = 0; + + while (true) { + INPUT_RECORD record; + DWORD count; + if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) { + return WEOF; + } + + if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) { + wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar; + if (wc == 0) { + continue; + } + + if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate + high_surrogate = wc; + continue; + } + if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate + if (high_surrogate != 0) { // Check if we have a high surrogate + return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000; + } + } + + high_surrogate = 0; // Reset the high surrogate + return static_cast(wc); + } + } +#else + wchar_t wc = getwchar(); + if (static_cast(wc) == WEOF) { + return WEOF; + } + +#if WCHAR_MAX == 0xFFFF + if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate + wchar_t low_surrogate = getwchar(); + if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate + return (static_cast(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000; + } + } + if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair + return 0xFFFD; // Return the replacement character U+FFFD + } +#endif + + return static_cast(wc); +#endif + } + + static void pop_cursor() { +#if defined(_WIN32) + if (hConsole != NULL) { + CONSOLE_SCREEN_BUFFER_INFO bufferInfo; + GetConsoleScreenBufferInfo(hConsole, &bufferInfo); + + COORD newCursorPosition = bufferInfo.dwCursorPosition; + if (newCursorPosition.X == 0) { + newCursorPosition.X = bufferInfo.dwSize.X - 1; + 
newCursorPosition.Y -= 1; + } else { + newCursorPosition.X -= 1; + } + + SetConsoleCursorPosition(hConsole, newCursorPosition); + return; + } +#endif + putc('\b', out); + } + + static int estimateWidth(char32_t codepoint) { +#if defined(_WIN32) + (void)codepoint; + return 1; +#else + return wcwidth(codepoint); +#endif + } + + static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) { +#if defined(_WIN32) + CONSOLE_SCREEN_BUFFER_INFO bufferInfo; + if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) { + // go with the default + return expectedWidth; + } + COORD initialPosition = bufferInfo.dwCursorPosition; + DWORD nNumberOfChars = length; + WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL); + + CONSOLE_SCREEN_BUFFER_INFO newBufferInfo; + GetConsoleScreenBufferInfo(hConsole, &newBufferInfo); + + // Figure out our real position if we're in the last column + if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) { + DWORD nNumberOfChars; + WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL); + GetConsoleScreenBufferInfo(hConsole, &newBufferInfo); + } + + int width = newBufferInfo.dwCursorPosition.X - initialPosition.X; + if (width < 0) { + width += newBufferInfo.dwSize.X; + } + return width; +#else + // We can trust expectedWidth if we've got one + if (expectedWidth >= 0 || tty == nullptr) { + fwrite(utf8_codepoint, length, 1, out); + return expectedWidth; + } + + fputs("\033[6n", tty); // Query cursor position + int x1; + int y1; + int x2; + int y2; + int results = 0; + results = fscanf(tty, "\033[%d;%dR", &y1, &x1); + + fwrite(utf8_codepoint, length, 1, tty); + + fputs("\033[6n", tty); // Query cursor position + results += fscanf(tty, "\033[%d;%dR", &y2, &x2); + + if (results != 4) { + return expectedWidth; + } + + int width = x2 - x1; + if (width < 0) { + // Calculate the width considering text wrapping + struct winsize w; + ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); + width += w.ws_col; + } + return width; +#endif + } + + static void replace_last(char ch) { +#if defined(_WIN32) + pop_cursor(); + put_codepoint(&ch, 1, 1); +#else + fprintf(out, "\b%c", ch); +#endif + } + + static void append_utf8(char32_t ch, std::string & out) { + if (ch <= 0x7F) { + out.push_back(static_cast(ch)); + } else if (ch <= 0x7FF) { + out.push_back(static_cast(0xC0 | ((ch >> 6) & 0x1F))); + out.push_back(static_cast(0x80 | (ch & 0x3F))); + } else if (ch <= 0xFFFF) { + out.push_back(static_cast(0xE0 | ((ch >> 12) & 0x0F))); + out.push_back(static_cast(0x80 | ((ch >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (ch & 0x3F))); + } else if (ch <= 0x10FFFF) { + out.push_back(static_cast(0xF0 | ((ch >> 18) & 0x07))); + out.push_back(static_cast(0x80 | ((ch >> 12) & 0x3F))); + out.push_back(static_cast(0x80 | ((ch >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (ch & 0x3F))); + } else { + // Invalid Unicode code point + } + } + + // Helper function to remove the last UTF-8 character from a string + static void pop_back_utf8_char(std::string & line) { + if (line.empty()) { + return; + } + + size_t pos = line.length() - 1; + + // Find the start of the last UTF-8 character (checking up to 4 bytes back) + for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) { + if ((line[pos] & 0xC0) != 0x80) { + break; // Found the start of the character + } + } + line.erase(pos); + } + + static bool readline_advanced(std::string & line, bool multiline_input) { + if (out != stdout) { + fflush(stdout); + } + + line.clear(); + std::vector widths; + 
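+        // widths records the on-screen width of each codepoint as it is echoed,
+        // so the backspace handling below can erase the right number of columns
+        // for double-width (e.g. CJK) characters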
bool is_special_char = false; + bool end_of_stream = false; + + char32_t input_char; + while (true) { + fflush(out); // Ensure all output is displayed before waiting for input + input_char = getchar32(); + + if (input_char == '\r' || input_char == '\n') { + break; + } + + if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) { + end_of_stream = true; + break; + } + + if (is_special_char) { + set_display(user_input); + replace_last(line.back()); + is_special_char = false; + } + + if (input_char == '\033') { // Escape sequence + char32_t code = getchar32(); + if (code == '[' || code == 0x1B) { + // Discard the rest of the escape sequence + while ((code = getchar32()) != (char32_t) WEOF) { + if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') { + break; + } + } + } + } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace + if (!widths.empty()) { + int count; + do { + count = widths.back(); + widths.pop_back(); + // Move cursor back, print space, and move cursor back again + for (int i = 0; i < count; i++) { + replace_last(' '); + pop_cursor(); + } + pop_back_utf8_char(line); + } while (count == 0 && !widths.empty()); + } + } else { + int offset = line.length(); + append_utf8(input_char, line); + int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char)); + if (width < 0) { + width = 0; + } + widths.push_back(width); + } + + if (!line.empty() && (line.back() == '\\' || line.back() == '/')) { + set_display(prompt); + replace_last(line.back()); + is_special_char = true; + } + } + + bool has_more = multiline_input; + if (is_special_char) { + replace_last(' '); + pop_cursor(); + + char last = line.back(); + line.pop_back(); + if (last == '\\') { + line += '\n'; + fputc('\n', out); + has_more = !has_more; + } else { + // llama will just eat the single space, it won't act as a space + if (line.length() == 1 && line.back() == ' ') { + line.clear(); + pop_cursor(); + } + has_more = false; + } + } else { + if (end_of_stream) { + has_more = false; + } else { + line += '\n'; + fputc('\n', out); + } + } + + fflush(out); + return has_more; + } + + static bool readline_simple(std::string & line, bool multiline_input) { +#if defined(_WIN32) + std::wstring wline; + if (!std::getline(std::wcin, wline)) { + // Input stream is bad or EOF received + line.clear(); + GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0); + return false; + } + + int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL); + line.resize(size_needed); + WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL); +#else + if (!std::getline(std::cin, line)) { + // Input stream is bad or EOF received + line.clear(); + return false; + } +#endif + if (!line.empty()) { + char last = line.back(); + if (last == '/') { // Always return control on '/' symbol + line.pop_back(); + return false; + } + if (last == '\\') { // '\\' changes the default action + line.pop_back(); + multiline_input = !multiline_input; + } + } + line += '\n'; + + // By default, continue input if multiline_input is set + return multiline_input; + } + + bool readline(std::string & line, bool multiline_input) { + set_display(user_input); + + if (simple_io) { + return readline_simple(line, multiline_input); + } + return readline_advanced(line, multiline_input); + } + +} diff --git a/common/console.h b/common/console.h new file mode 100644 index 
0000000000000000000000000000000000000000..ec175269b9d8af48803d0b6e618d008a9ab99b4d --- /dev/null +++ b/common/console.h @@ -0,0 +1,19 @@ +// Console functions + +#pragma once + +#include + +namespace console { + enum display_t { + reset = 0, + prompt, + user_input, + error + }; + + void init(bool use_simple_io, bool use_advanced_display); + void cleanup(); + void set_display(display_t display); + bool readline(std::string & line, bool multiline_input); +} diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5a545a8076460a55ed83ab4437f387f8aee7d31e --- /dev/null +++ b/common/grammar-parser.cpp @@ -0,0 +1,424 @@ +#include "grammar-parser.h" +#include +#include +#include +#include +#include +#include + +namespace grammar_parser { + // NOTE: assumes valid utf8 (but checks for overrun) + // copied from llama.cpp + static std::pair decode_utf8(const char * src) { + static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t first_byte = static_cast(*src); + uint8_t highbits = first_byte >> 4; + int len = lookup[highbits]; + uint8_t mask = (1 << (8 - len)) - 1; + uint32_t value = first_byte & mask; + const char * end = src + len; // may overrun! + const char * pos = src + 1; + for ( ; pos < end && *pos; pos++) { + value = (value << 6) + (static_cast(*pos) & 0x3F); + } + return std::make_pair(value, pos); + } + + static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { + uint32_t next_id = static_cast(state.symbol_ids.size()); + auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id)); + return result.first->second; + } + + static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) { + uint32_t next_id = static_cast(state.symbol_ids.size()); + state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id; + return next_id; + } + + static void add_rule( + parse_state & state, + uint32_t rule_id, + const std::vector & rule) { + if (state.rules.size() <= rule_id) { + state.rules.resize(rule_id + 1); + } + state.rules[rule_id] = rule; + } + + static bool is_word_char(char c) { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9'); + } + + static std::pair parse_hex(const char * src, int size) { + const char * pos = src; + const char * end = src + size; + uint32_t value = 0; + for ( ; pos < end && *pos; pos++) { + value <<= 4; + char c = *pos; + if ('a' <= c && c <= 'f') { + value += c - 'a' + 10; + } else if ('A' <= c && c <= 'F') { + value += c - 'A' + 10; + } else if ('0' <= c && c <= '9') { + value += c - '0'; + } else { + break; + } + } + if (pos != end) { + throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src); + } + return std::make_pair(value, pos); + } + + static const char * parse_space(const char * src, bool newline_ok) { + const char * pos = src; + while (*pos == ' ' || *pos == '\t' || *pos == '#' || + (newline_ok && (*pos == '\r' || *pos == '\n'))) { + if (*pos == '#') { + while (*pos && *pos != '\r' && *pos != '\n') { + pos++; + } + } else { + pos++; + } + } + return pos; + } + + static const char * parse_name(const char * src) { + const char * pos = src; + while (is_word_char(*pos)) { + pos++; + } + if (pos == src) { + throw std::runtime_error(std::string("expecting name at ") + src); + } + return pos; + } + + static std::pair parse_char(const char * src) { + if (*src == '\\') { + switch (src[1]) { + case 'x': 
return parse_hex(src + 2, 2);
+                case 'u': return parse_hex(src + 2, 4);
+                case 'U': return parse_hex(src + 2, 8);
+                case 't': return std::make_pair('\t', src + 2);
+                case 'r': return std::make_pair('\r', src + 2);
+                case 'n': return std::make_pair('\n', src + 2);
+                case '\\':
+                case '"':
+                case '[':
+                case ']':
+                    return std::make_pair(src[1], src + 2);
+                default:
+                    throw std::runtime_error(std::string("unknown escape at ") + src);
+            }
+        } else if (*src) {
+            return decode_utf8(src);
+        }
+        throw std::runtime_error("unexpected end of input");
+    }
+
+    const char * parse_alternates(
+        parse_state       & state,
+        const char        * src,
+        const std::string & rule_name,
+        uint32_t            rule_id,
+        bool                is_nested);
+
+    static const char * parse_sequence(
+        parse_state       & state,
+        const char        * src,
+        const std::string & rule_name,
+        std::vector<llama_grammar_element> & out_elements,
+        bool                is_nested) {
+        size_t last_sym_start = out_elements.size();
+        const char * pos = src;
+        while (*pos) {
+            if (*pos == '"') { // literal string
+                pos++;
+                last_sym_start = out_elements.size();
+                while (*pos != '"') {
+                    auto char_pair = parse_char(pos);
+                    pos = char_pair.second;
+                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '[') { // char range(s)
+                pos++;
+                enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
+                if (*pos == '^') {
+                    pos++;
+                    start_type = LLAMA_GRETYPE_CHAR_NOT;
+                }
+                last_sym_start = out_elements.size();
+                while (*pos != ']') {
+                    auto char_pair = parse_char(pos);
+                    pos = char_pair.second;
+                    enum llama_gretype type = last_sym_start < out_elements.size()
+                        ? LLAMA_GRETYPE_CHAR_ALT
+                        : start_type;
+
+                    out_elements.push_back({type, char_pair.first});
+                    if (pos[0] == '-' && pos[1] != ']') {
+                        auto endchar_pair = parse_char(pos + 1);
+                        pos = endchar_pair.second;
+                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
+                    }
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (is_word_char(*pos)) { // rule reference
+                const char * name_end = parse_name(pos);
+                uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
+                pos = parse_space(name_end, is_nested);
+                last_sym_start = out_elements.size();
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
+            } else if (*pos == '(') { // grouping
+                // parse nested alternates into synthesized rule
+                pos = parse_space(pos + 1, true);
+                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
+                pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
+                last_sym_start = out_elements.size();
+                // output reference to synthesized rule
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+                if (*pos != ')') {
+                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
+                if (last_sym_start == out_elements.size()) {
+                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
+                }
+
+                // apply transformation to previous symbol (last_sym_start to end) according to
+                // rewrite rules:
+                // S* --> S' ::= S S' |
+                // S+ --> S' ::= S S' | S
+                // S? 
--> S' ::= S | + uint32_t sub_rule_id = generate_symbol_id(state, rule_name); + std::vector sub_rule; + // add preceding symbol to generated rule + sub_rule.insert( + sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end()); + if (*pos == '*' || *pos == '+') { + // cause generated rule to recurse + sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); + } + // mark start of alternate def + sub_rule.push_back({LLAMA_GRETYPE_ALT, 0}); + if (*pos == '+') { + // add preceding symbol as alternate only for '+' (otherwise empty) + sub_rule.insert( + sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end()); + } + sub_rule.push_back({LLAMA_GRETYPE_END, 0}); + add_rule(state, sub_rule_id, sub_rule); + + // in original rule, replace previous symbol with reference to generated rule + out_elements.resize(last_sym_start); + out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); + + pos = parse_space(pos + 1, is_nested); + } else { + break; + } + } + return pos; + } + + const char * parse_alternates( + parse_state & state, + const char * src, + const std::string & rule_name, + uint32_t rule_id, + bool is_nested) { + std::vector rule; + const char * pos = parse_sequence(state, src, rule_name, rule, is_nested); + while (*pos == '|') { + rule.push_back({LLAMA_GRETYPE_ALT, 0}); + pos = parse_space(pos + 1, true); + pos = parse_sequence(state, pos, rule_name, rule, is_nested); + } + rule.push_back({LLAMA_GRETYPE_END, 0}); + add_rule(state, rule_id, rule); + return pos; + } + + static const char * parse_rule(parse_state & state, const char * src) { + const char * name_end = parse_name(src); + const char * pos = parse_space(name_end, false); + size_t name_len = name_end - src; + uint32_t rule_id = get_symbol_id(state, src, name_len); + const std::string name(src, name_len); + + if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) { + throw std::runtime_error(std::string("expecting ::= at ") + pos); + } + pos = parse_space(pos + 3, true); + + pos = parse_alternates(state, pos, name, rule_id, false); + + if (*pos == '\r') { + pos += pos[1] == '\n' ? 
2 : 1; + } else if (*pos == '\n') { + pos++; + } else if (*pos) { + throw std::runtime_error(std::string("expecting newline or end at ") + pos); + } + return parse_space(pos, true); + } + + parse_state parse(const char * src) { + try { + parse_state state; + const char * pos = parse_space(src, true); + while (*pos) { + pos = parse_rule(state, pos); + } + return state; + } catch (const std::exception & err) { + fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what()); + return parse_state(); + } + } + + static void print_grammar_char(FILE * file, uint32_t c) { + if (0x20 <= c && c <= 0x7f) { + fprintf(file, "%c", static_cast(c)); + } else { + // cop out of encoding UTF-8 + fprintf(file, "", c); + } + } + + static bool is_char_element(llama_grammar_element elem) { + switch (elem.type) { + case LLAMA_GRETYPE_CHAR: return true; + case LLAMA_GRETYPE_CHAR_NOT: return true; + case LLAMA_GRETYPE_CHAR_ALT: return true; + case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true; + default: return false; + } + } + + static void print_rule_binary(FILE * file, const std::vector & rule) { + for (auto elem : rule) { + switch (elem.type) { + case LLAMA_GRETYPE_END: fprintf(file, "END"); break; + case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break; + case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break; + case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break; + case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break; + case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break; + case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break; + } + switch (elem.type) { + case LLAMA_GRETYPE_END: + case LLAMA_GRETYPE_ALT: + case LLAMA_GRETYPE_RULE_REF: + fprintf(file, "(%u) ", elem.value); + break; + case LLAMA_GRETYPE_CHAR: + case LLAMA_GRETYPE_CHAR_NOT: + case LLAMA_GRETYPE_CHAR_RNG_UPPER: + case LLAMA_GRETYPE_CHAR_ALT: + fprintf(file, "(\""); + print_grammar_char(file, elem.value); + fprintf(file, "\") "); + break; + } + } + fprintf(file, "\n"); + } + + static void print_rule( + FILE * file, + uint32_t rule_id, + const std::vector & rule, + const std::map & symbol_id_names) { + if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) { + throw std::runtime_error( + "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id)); + } + fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str()); + for (size_t i = 0, end = rule.size() - 1; i < end; i++) { + llama_grammar_element elem = rule[i]; + switch (elem.type) { + case LLAMA_GRETYPE_END: + throw std::runtime_error( + "unexpected end of rule: " + std::to_string(rule_id) + "," + + std::to_string(i)); + case LLAMA_GRETYPE_ALT: + fprintf(file, "| "); + break; + case LLAMA_GRETYPE_RULE_REF: + fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str()); + break; + case LLAMA_GRETYPE_CHAR: + fprintf(file, "["); + print_grammar_char(file, elem.value); + break; + case LLAMA_GRETYPE_CHAR_NOT: + fprintf(file, "[^"); + print_grammar_char(file, elem.value); + break; + case LLAMA_GRETYPE_CHAR_RNG_UPPER: + if (i == 0 || !is_char_element(rule[i - 1])) { + throw std::runtime_error( + "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " + + std::to_string(rule_id) + "," + std::to_string(i)); + } + fprintf(file, "-"); + print_grammar_char(file, elem.value); + break; + case LLAMA_GRETYPE_CHAR_ALT: + if (i == 0 || !is_char_element(rule[i - 1])) { + throw std::runtime_error( + "LLAMA_GRETYPE_CHAR_ALT without preceding char: " + + std::to_string(rule_id) + "," + std::to_string(i)); + } + print_grammar_char(file, 
elem.value); + break; + } + if (is_char_element(elem)) { + switch (rule[i + 1].type) { + case LLAMA_GRETYPE_CHAR_ALT: + case LLAMA_GRETYPE_CHAR_RNG_UPPER: + break; + default: + fprintf(file, "] "); + } + } + } + fprintf(file, "\n"); + } + + void print_grammar(FILE * file, const parse_state & state) { + try { + std::map symbol_id_names; + for (auto kv : state.symbol_ids) { + symbol_id_names[kv.second] = kv.first; + } + for (size_t i = 0, end = state.rules.size(); i < end; i++) { + // fprintf(file, "%zu: ", i); + // print_rule_binary(file, state.rules[i]); + print_rule(file, uint32_t(i), state.rules[i], symbol_id_names); + // fprintf(file, "\n"); + } + } catch (const std::exception & err) { + fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what()); + } + } + + std::vector parse_state::c_rules() { + std::vector ret; + ret.reserve(rules.size()); + for (const auto & rule : rules) { + ret.push_back(rule.data()); + } + return ret; + } +} diff --git a/common/grammar-parser.h b/common/grammar-parser.h new file mode 100644 index 0000000000000000000000000000000000000000..9037d72728a42ed772f384f3d7ddcef01d0d15f5 --- /dev/null +++ b/common/grammar-parser.h @@ -0,0 +1,29 @@ +// Implements a parser for an extended Backus-Naur form (BNF), producing the +// binary context-free grammar format specified by llama.h. Supports character +// ranges, grouping, and repetition operators. As an example, a grammar for +// arithmetic might look like: +// +// root ::= expr +// expr ::= term ([-+*/] term)* +// term ::= num | "(" space expr ")" space +// num ::= [0-9]+ space +// space ::= [ \t\n]* + +#pragma once +#include "llama.h" +#include +#include +#include +#include + +namespace grammar_parser { + struct parse_state { + std::map symbol_ids; + std::vector> rules; + + std::vector c_rules(); + }; + + parse_state parse(const char * src); + void print_grammar(FILE * file, const parse_state & state); +} diff --git a/common/log.h b/common/log.h new file mode 100644 index 0000000000000000000000000000000000000000..18f3b9761a7880cf7ff9b23a31d5f91dd945a3b2 --- /dev/null +++ b/common/log.h @@ -0,0 +1,643 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +// -------------------------------- +// +// Basic usage: +// +// -------- +// +// The LOG() and LOG_TEE() macros are ready to go by default +// they do not require any initialization. +// +// LOGLN() and LOG_TEELN() are variants which automatically +// include \n character at the end of the log string. +// +// LOG() behaves exactly like printf, by default writing to a logfile. +// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ). +// +// Default logfile is named +// "llama..log" +// Default LOG_TEE() secondary output target is +// stderr +// +// Logs can be dynamically disabled or enabled using functions: +// log_disable() +// and +// log_enable() +// +// A log target can be changed with: +// log_set_target( string ) +// creating and opening, or re-opening a file by string filename +// or +// log_set_target( FILE* ) +// allowing to point at stderr, stdout, or any valid FILE* file handler. +// +// -------- +// +// End of Basic usage. +// +// -------------------------------- + +// Specifies a log target. 
+// default uses log_handler() with "llama.log" log file
+// this can be changed, by defining LOG_TARGET
+// like so:
+//
+//   #define LOG_TARGET (a valid FILE*)
+//   #include "log.h"
+//
+// or it can be simply redirected to stdout or stderr
+// like so:
+//
+//   #define LOG_TARGET stderr
+//   #include "log.h"
+//
+// The log target can also be redirected to a different function
+// like so:
+//
+//   #define LOG_TARGET log_handler_different()
+//   #include "log.h"
+//
+//   FILE* log_handler_different()
+//   {
+//       return stderr;
+//   }
+//
+// or:
+//
+//   #define LOG_TARGET log_handler_another_one("somelog.log")
+//   #include "log.h"
+//
+//   FILE* log_handler_another_one(char*filename)
+//   {
+//       static FILE* logfile = nullptr;
+//       (...)
+//       if( !logfile )
+//       {
+//           fopen(...)
+//       }
+//       (...)
+//       return logfile
+//   }
+//
+#ifndef LOG_TARGET
+    #define LOG_TARGET log_handler()
+#endif
+
+#ifndef LOG_TEE_TARGET
+    #define LOG_TEE_TARGET stderr
+#endif
+
+// Utility to obtain "pid" like unique process id and use it when creating log files.
+inline std::string log_get_pid()
+{
+    static std::string pid;
+    if (pid.empty())
+    {
+        // std::this_thread::get_id() is the most portable way of obtaining a "process id"
+        // it's not the same as "pid" but is unique enough to solve multiple instances
+        // trying to write to the same log.
+        std::stringstream ss;
+        ss << std::this_thread::get_id();
+        pid = ss.str();
+    }
+
+    return pid;
+}
+
+// Utility function for generating log file names with unique id based on thread id.
+// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
+// where the number is a runtime id of the current thread.
+
+#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
+
+// INTERNAL, DO NOT USE
+inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
+{
+    std::stringstream buf;
+
+    buf << log_file_basename;
+    buf << ".";
+    buf << log_get_pid();
+    buf << ".";
+    buf << log_file_extension;
+
+    return buf.str();
+}
+
+#ifndef LOG_DEFAULT_FILE_NAME
+    #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
+#endif
+
+// Utility for turning #define values into string literals
+// so we can have a define for stderr and
+// we can print "stderr" instead of literal stderr, etc.
+#define LOG_STRINGIZE1(s) #s
+#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
+
+#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
+
+// Allows disabling timestamps.
+// in order to disable, define LOG_NO_TIMESTAMPS +// like so: +// +// #define LOG_NO_TIMESTAMPS +// #include "log.h" +// +#ifndef LOG_NO_TIMESTAMPS + #ifndef _MSC_VER + #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " + #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() + #else + #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " + #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() + #endif +#else + #define LOG_TIMESTAMP_FMT "%s" + #define LOG_TIMESTAMP_VAL ,"" +#endif + +#ifdef LOG_TEE_TIMESTAMPS + #ifndef _MSC_VER + #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " + #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() + #else + #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " + #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() + #endif +#else + #define LOG_TEE_TIMESTAMP_FMT "%s" + #define LOG_TEE_TIMESTAMP_VAL ,"" +#endif + +// Allows disabling file/line/function prefix +// in order to disable, define LOG_NO_FILE_LINE_FUNCTION +// like so: +// +// #define LOG_NO_FILE_LINE_FUNCTION +// #include "log.h" +// +#ifndef LOG_NO_FILE_LINE_FUNCTION + #ifndef _MSC_VER + #define LOG_FLF_FMT "[%24s:%5d][%24s] " + #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ + #else + #define LOG_FLF_FMT "[%24s:%5ld][%24s] " + #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ + #endif +#else + #define LOG_FLF_FMT "%s" + #define LOG_FLF_VAL ,"" +#endif + +#ifdef LOG_TEE_FILE_LINE_FUNCTION + #ifndef _MSC_VER + #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] " + #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ + #else + #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] " + #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ + #endif +#else + #define LOG_TEE_FLF_FMT "%s" + #define LOG_TEE_FLF_VAL ,"" +#endif + +// Utility for synchronizing log configuration state +// since std::optional was introduced only in c++17 +enum LogTriState +{ + LogTriStateSame, + LogTriStateFalse, + LogTriStateTrue +}; + +// INTERNAL, DO NOT USE +// USE LOG() INSTEAD +// +#ifndef _MSC_VER + #define LOG_IMPL(str, ...) \ + { \ + if (LOG_TARGET != nullptr) \ + { \ + fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ + fflush(LOG_TARGET); \ + } \ + } +#else + #define LOG_IMPL(str, ...) \ + { \ + if (LOG_TARGET != nullptr) \ + { \ + fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ + fflush(LOG_TARGET); \ + } \ + } +#endif + +// INTERNAL, DO NOT USE +// USE LOG_TEE() INSTEAD +// +#ifndef _MSC_VER + #define LOG_TEE_IMPL(str, ...) \ + { \ + if (LOG_TARGET != nullptr) \ + { \ + fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ + fflush(LOG_TARGET); \ + } \ + if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ + { \ + fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \ + fflush(LOG_TEE_TARGET); \ + } \ + } +#else + #define LOG_TEE_IMPL(str, ...) 
\ + { \ + if (LOG_TARGET != nullptr) \ + { \ + fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ + fflush(LOG_TARGET); \ + } \ + if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ + { \ + fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \ + fflush(LOG_TEE_TARGET); \ + } \ + } +#endif + +// The '\0' as a last argument, is a trick to bypass the silly +// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro" +// so we can have a single macro which can be called just like printf. + +// Main LOG macro. +// behaves like printf, and supports arguments the exact same way. +// +#ifndef _MSC_VER + #define LOG(...) LOG_IMPL(__VA_ARGS__, "") +#else + #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "") +#endif + +// Main TEE macro. +// does the same as LOG +// and +// simultaneously writes stderr. +// +// Secondary target can be changed just like LOG_TARGET +// by defining LOG_TEE_TARGET +// +#ifndef _MSC_VER + #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") +#else + #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "") +#endif + +// LOG macro variants with auto endline. +#ifndef _MSC_VER + #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") + #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") +#else + #define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n") + #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n") +#endif + +// INTERNAL, DO NOT USE +inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) +{ + static bool _initialized{false}; + static bool _disabled{(filename.empty() && target == nullptr)}; + static std::string log_current_filename{filename}; + static FILE *log_current_target{target}; + static FILE *logfile = nullptr; + + if (change) + { + if (disable == LogTriStateTrue) + { + // Disable primary target + _disabled = true; + } + // If previously disabled, only enable, and keep previous target + else if (disable == LogTriStateFalse) + { + _disabled = false; + } + // Otherwise, process the arguments + else if (log_current_filename != filename || log_current_target != target) + { + _initialized = false; + } + } + + if (_disabled) + { + // Log is disabled + return nullptr; + } + + if (_initialized) + { + // with fallback in case something went wrong + return logfile ? logfile : stderr; + } + + // do the (re)initialization + if (target != nullptr) + { + if (logfile != nullptr && logfile != stdout && logfile != stderr) + { + fclose(logfile); + } + + log_current_filename = LOG_DEFAULT_FILE_NAME; + log_current_target = target; + + logfile = target; + } + else + { + if (log_current_filename != filename) + { + if (logfile != nullptr && logfile != stdout && logfile != stderr) + { + fclose(logfile); + } + } + + logfile = fopen(filename.c_str(), "w"); + } + + if (!logfile) + { + // Verify whether the file was opened, otherwise fallback to stderr + logfile = stderr; + + fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno)); + fflush(stderr); + + // At this point we let the init flag be to true below, and let the target fallback to stderr + // otherwise we would repeatedly fopen() which was already unsuccessful + } + + _initialized = true; + + return logfile ? 
logfile : stderr;
+}
+
+// INTERNAL, DO NOT USE
+inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
+{
+    return log_handler1_impl(change, disable, filename, target);
+}
+
+// Disables logs entirely at runtime.
+// Makes LOG() and LOG_TEE() produce no output,
+// until enabled again.
+#define log_disable() log_disable_impl()
+
+// INTERNAL, DO NOT USE
+inline FILE *log_disable_impl()
+{
+    return log_handler1_impl(true, LogTriStateTrue);
+}
+
+// Enables logs at runtime.
+#define log_enable() log_enable_impl()
+
+// INTERNAL, DO NOT USE
+inline FILE *log_enable_impl()
+{
+    return log_handler1_impl(true, LogTriStateFalse);
+}
+
+// Sets target for logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
+#define log_set_target(target) log_set_target_impl(target)
+
+// INTERNAL, DO NOT USE
+inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
+inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
+
+// INTERNAL, DO NOT USE
+inline FILE *log_handler() { return log_handler1_impl(); }
+
+inline void log_test()
+{
+    log_disable();
+    LOG("01 Hello World to nobody, because logs are disabled!\n")
+    log_enable();
+    LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET))
+    LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n")
+    log_set_target(stderr);
+    LOG("04 Hello World to stderr!\n")
+    LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n")
+    log_set_target(LOG_DEFAULT_FILE_NAME);
+    LOG("06 Hello World to default log file!\n")
+    log_set_target(stdout);
+    LOG("07 Hello World to stdout!\n")
+    log_set_target(LOG_DEFAULT_FILE_NAME);
+    LOG("08 Hello World to default log file again!\n")
+    log_disable();
+    LOG("09 Hello World _1_ into the void!\n")
+    log_enable();
+    LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n")
+    log_disable();
+    log_set_target("llama.anotherlog.log");
+    LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n")
+    log_enable();
+    LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n")
+    log_set_target("llama.yetanotherlog.log");
+    LOG("13 Hello World this time in yet new file?\n")
+    log_set_target(log_filename_generator("llama_autonamed", "log"));
+    LOG("14 Hello World in log with generated filename!\n")
+#ifdef _MSC_VER
+    LOG_TEE("15 Hello msvc TEE without arguments\n")
+    LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
+    LOG_TEELN("17 Hello msvc TEELN without arguments\n")
+    LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test")
+    LOG("19 Hello msvc LOG without arguments\n")
+    LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test")
+    LOGLN("21 Hello msvc LOGLN without arguments\n")
+    LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test")
+#endif
+}
+
+inline bool log_param_single_parse(const std::string & param)
+{
+    if ( param == "--log-test")
+    {
+        log_test();
+        return true;
+    }
+
+    if ( param == "--log-disable")
+    {
+        log_disable();
+        return true;
+    }
+
+    if ( param == "--log-enable")
+    {
+        log_enable();
+        return true;
+    }
+
+    return false;
+}
+
+inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const 
+{
+    if ( param == "--log-file")
+    {
+        if (!check_but_dont_parse)
+        {
+            log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
+        }
+
+        return true;
+    }
+
+    return false;
+}
+
+inline void log_print_usage()
+{
+    printf("log options:\n");
+    /* format
+    printf("  -h, --help            show this help message and exit\n");*/
+    /* spacing
+    printf("__-param----------------Description\n");*/
+    printf("  --log-test            Run simple logging test\n");
+    printf("  --log-disable         Disable trace logs\n");
+    printf("  --log-enable          Enable trace logs\n");
+    printf("  --log-file            Specify a log filename (without extension)\n");
+    printf("                        Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /*  */
+}
+
+#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
+
+// INTERNAL, DO NOT USE
+inline void log_dump_cmdline_impl(int argc, char **argv)
+{
+    std::stringstream buf;
+    for (int i = 0; i < argc; ++i)
+    {
+        if (std::string(argv[i]).find(' ') != std::string::npos)
+        {
+            buf << " \"" << argv[i] << "\"";
+        }
+        else
+        {
+            buf << " " << argv[i];
+        }
+    }
+    LOGLN("Cmd:%s", buf.str().c_str())
+}
+
+#define log_tostr(var) log_var_to_string_impl(var).c_str()
+
+inline std::string log_var_to_string_impl(bool var)
+{
+    return var ? "true" : "false";
+}
+
+inline std::string log_var_to_string_impl(std::string var)
+{
+    return var;
+}
+
+inline std::string log_var_to_string_impl(const std::vector<int> & var)
+{
+    std::stringstream buf;
+    buf << "[ ";
+    bool first = true;
+    for (auto e : var)
+    {
+        if (first)
+        {
+            first = false;
+        }
+        else
+        {
+            buf << ", ";
+        }
+        buf << std::to_string(e);
+    }
+    buf << " ]";
+
+    return buf.str();
+}
+
+#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \
+    [&tokens, &ctx]() \
+    { \
+        std::stringstream buf; \
+        buf << "[ "; \
+        \
+        bool first = true; \
+        for (const auto &token : tokens) \
+        { \
+            if (!first) \
+                buf << ", "; \
+            else \
+                first = false; \
+            \
+            auto detokenized = llama_token_to_piece(ctx, token); \
+            \
+            detokenized.erase( \
+                std::remove_if( \
+                    detokenized.begin(), \
+                    detokenized.end(), \
+                    [](const unsigned char c) { return !std::isprint(c); }), \
+                detokenized.end()); \
+            \
+            buf \
+                << "'" << detokenized << "'" \
+                << ":" << std::to_string(token); \
+        } \
+        buf << " ]"; \
+        \
+        return buf.str(); \
+    }() \
+    .c_str()
+
+#ifdef LOG_DISABLE_LOGS
+
+#undef LOG
+#define LOG(...) // dummy stub
+#undef LOGLN
+#define LOGLN(...) // dummy stub
+
+#undef LOG_TEE
+#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
+
+#undef LOG_TEELN
+#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
+
+#undef LOG_DISABLE
+#define LOG_DISABLE() // dummy stub
+
+#undef LOG_ENABLE
+#define LOG_ENABLE() // dummy stub
+
+#undef LOG_SET_TARGET
+#define LOG_SET_TARGET(...) // dummy stub
+
+#undef LOG_DUMP_CMDLINE
+#define LOG_DUMP_CMDLINE(...) // dummy stub
+
+#endif // LOG_DISABLE_LOGS
diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bd34dc440769b3ec5cf837402bd5d3d0d229a8c
--- /dev/null
+++ b/convert-baichuan-hf-to-gguf.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python3
+# HF baichuan --> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+import itertools
+import gguf
+import numpy as np
+import torch
+from sentencepiece import SentencePieceProcessor # type: ignore[import]
+
+
+if TYPE_CHECKING:
+    from typing import TypeAlias
+
+NDArray: TypeAlias = 'np.ndarray[Any, Any]'
+
+# reverse HF permute back to original pth layout
+
+
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
+
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
+
+def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int | None = None) -> NDArray:
+    r = weights.shape[0] // 3
+    return reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv)
+
+def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
+    r = weights.shape[0] // 3
+    return weights[r * n_part : r * n_part + r, ...]
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+
+    return num_parts
+
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
+    parser.add_argument(
+        "--vocab-only", action="store_true",
+        help="extract only the vocab",
+    )
+    parser.add_argument(
+        "--outfile", type=Path,
+        help="path to write to; default: based on input",
+    )
+    parser.add_argument(
+        "model", type=Path,
+        help="directory containing model file, or model file itself (*.bin)",
+    )
+    parser.add_argument(
+        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
+        help="output format - use 0 for float32, 1 for float16",
+    )
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+
+print("gguf: loading model " + dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "BaichuanForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit(1)
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+print(f"num_parts: {num_parts}\n")
+
+ARCH=gguf.MODEL_ARCH.BAICHUAN
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
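+# For reference, count_model_parts() above only counts shard files named
+# "pytorch_model-*"; the tensor loop further below rebuilds the exact shard
+# names from that count. A minimal sketch of the same naming pattern (made-up
+# two-part checkpoint, illustration only -- not read from any real model):
+_example_parts = [f"pytorch_model-{n:05}-of-{2:05}.bin" for n in range(1, 3)]
+assert _example_parts == ["pytorch_model-00001-of-00002.bin",
+                          "pytorch_model-00002-of-00002.bin"]
+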
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+elif "model_max_length" in hparams:
+    ctx_length = hparams["model_max_length"]
+else:
+    print("gguf: cannot find ctx length parameter.")
+
+    sys.exit(1)
+
+
+gguf_writer.add_name(dir_model.name)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] is not None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytes] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+tokenizer_model_file = dir_model / 'tokenizer.model'
+if not tokenizer_model_file.is_file():
+    print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
+    sys.exit(1)
+
+# vocab type sentencepiece
+print("gguf: get sentencepiece tokenizer vocab, scores and token types")
+
+tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
+
+for i in range(tokenizer.vocab_size()):
+    text: bytes
+    score: float
+
+    piece = tokenizer.id_to_piece(i)
+    text = piece.encode("utf-8")
+    score = tokenizer.get_score(i)
+
+    toktype = 1  # default to normal token type
+    if tokenizer.is_unknown(i):
+        toktype = 2
+    if tokenizer.is_control(i):
+        toktype = 3
+
+    # toktype = 4 is user-defined = tokens from added_tokens.json
+
+    if tokenizer.is_unused(i):
+        toktype = 5
+    if tokenizer.is_byte(i):
+        toktype = 6
+
+    tokens.append(text)
+    scores.append(score)
+    toktypes.append(toktype)
+
+added_tokens_file = dir_model / 'added_tokens.json'
+if added_tokens_file.is_file():
+    with open(added_tokens_file, "r", encoding="utf-8") as f:
+        addtokens_json = json.load(f)
+
+        print("gguf: get added tokens")
+
+        for key in addtokens_json:
+            tokens.append( key.encode("utf-8") )
+            scores.append(-1000.0)
+            toktypes.append(4)  # user-defined token type
+
+
+gguf_writer.add_tokenizer_model("llama")
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model)
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    tmp = model_part
+    for i in range(block_count):
+        if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
+            print(f"Unpacking and permuting layer {i}")
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"], 0, head_count, head_count)
+            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"], 1, head_count, head_count_kv)
+            tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
+            del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
+
+    for name in model_part.keys():
+        data = model_part[name]
+        # we don't need these
+        if name.endswith(".rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            print("Cannot map tensor '" + name + "'")
+            sys.exit(1)
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+        gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py
new file mode 100644
index 0000000000000000000000000000000000000000..88338d823b03585275be7c9f36ba9b4c7e32abd5
--- /dev/null
+++ b/convert-falcon-hf-to-gguf.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+# HF falcon --> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer # type: ignore[import]
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+
+def bytes_to_unicode():
+    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + return dict(zip(bs, (chr(n) for n in cs))) + + +def count_model_parts(dir_model: Path) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file, or model file itself (*.bin)", + ) + parser.add_argument( + "ftype", type=int, choices=[0, 1], default=1, nargs='?', + help="output format - use 0 for float32, 1 for float16", + ) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] != "RWForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit(1) + +# get number of model parts +num_parts = count_model_parts(dir_model) + +ARCH=gguf.MODEL_ARCH.FALCON +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams["n_layer"] + +gguf_writer.add_name("Falcon") +gguf_writer.add_context_length(2048) # not in config.json +gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform +gguf_writer.add_embedding_length(hparams["hidden_size"]) +gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"]) +gguf_writer.add_block_count(block_count) +gguf_writer.add_head_count(hparams["n_head"]) +if "n_head_kv" in hparams: + gguf_writer.add_head_count_kv(hparams["n_head_kv"]) +else: + gguf_writer.add_head_count_kv(1) +gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) +gguf_writer.add_file_type(ftype) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +tokenizer_json_file = dir_model / 'tokenizer.json' +if not tokenizer_json_file.is_file(): + print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr) + sys.exit(1) + +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") + +with open(tokenizer_json_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + +print("gguf: get gpt2 tokenizer vocab") + +# The number of tokens in tokenizer.json can differ from 
the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running inference.
+vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
+
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+for i in range(vocab_size):
+    if i in reverse_vocab:
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if ord(c) < 256:  # single byte character
+                    text.append(byte_decoder[ord(c)])
+                else:  # multibyte special token character
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)
+
+    tokens.append(text)
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)  # dummy
+
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
+
+# params for qkv transform
+n_head = hparams["n_head"]
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+
+head_dim = hparams["hidden_size"] // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(dir_model / part_name, map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        # QKV tensor transform
+        # The original query_key_value tensor contains n_head_kv "kv groups",
+        # each consisting of n_head/n_head_kv query weights followed by one key
+        # and one value weight (shared by all query heads in the kv group).
+        # This layout makes it a big pain to work with in GGML.
+        # So we rearrange them here, so that we have n_head query weights
+        # followed by n_head_kv key weights followed by n_head_kv value weights,
+        # in contiguous fashion.
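+        # A worked example with illustrative (not Falcon-real) sizes: with
+        # n_head = 4, n_head_kv = 2, head_dim = 64, the source rows come in two
+        # kv groups of n_head // n_head_kv + 2 = 4 row-blocks each:
+        #     [ q0 q1 k0 v0 | q2 q3 k1 v1 ]
+        # The view below exposes them as (n_head_kv, 4, head_dim, ...), so that
+        # qkv[:, :-2] collects [q0 q1 q2 q3], qkv[:, [-2]] collects [k0 k1] and
+        # qkv[:, [-1]] collects [v0 v1], i.e. the contiguous order we want.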
+        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+        if "query_key_value" in name:
+            qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+            q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
+            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            data = torch.cat((q, k, v)).reshape_as(data)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            print("Cannot map tensor '" + name + "'")
+            sys.exit(1)
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py
new file mode 100644
index 0000000000000000000000000000000000000000..782410e44f2d1d92d832c699d749f830e0334434
--- /dev/null
+++ b/convert-gptneox-hf-to-gguf.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+# HF gptneox --> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer # type: ignore[import]
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+ """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + return dict(zip(bs, (chr(n) for n in cs))) + + +def count_model_parts(dir_model: Path) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file, or model file itself (*.bin)", + ) + parser.add_argument( + "ftype", type=int, choices=[0, 1], default=1, nargs='?', + help="output format - use 0 for float32, 1 for float16", + ) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] != "GPTNeoXForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit() + +# get number of model parts +num_parts = count_model_parts(dir_model) + +ARCH=gguf.MODEL_ARCH.GPTNEOX +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams["num_hidden_layers"] + +gguf_writer.add_name(dir_model.name) +gguf_writer.add_context_length(hparams["max_position_embeddings"]) +gguf_writer.add_embedding_length(hparams["hidden_size"]) +gguf_writer.add_block_count(block_count) +gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) +gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))) +gguf_writer.add_head_count(hparams["num_attention_heads"]) +gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) +gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"]) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] + +tokenizer_json_file = dir_model / 'tokenizer.json' +if not tokenizer_json_file.is_file(): + print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr) + sys.exit(1) + +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") + +with open(tokenizer_json_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) + +print("gguf: get gpt2 tokenizer vocab") + +vocab_size = len(tokenizer_json["model"]["vocab"]) + +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) + +reverse_vocab = {id: 
encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+for i in range(vocab_size):
+    if i in reverse_vocab:
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if ord(c) < 256:  # single byte character
+                    text.append(byte_decoder[ord(c)])
+                else:  # multibyte special token character
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)
+
+    tokens.append(text)
+
+gguf_writer.add_token_list(tokens)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            print("Cannot map tensor '" + name + "'")
+            sys.exit(1)
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5d3e0b3c3acea724c23633a71df498e08fe91ce
--- /dev/null
+++ b/convert-llama-ggml-to-gguf.py
@@ -0,0 +1,451 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import math
+import struct
+import sys
+from enum import IntEnum
+from pathlib import Path
+
+import numpy as np
+
+import os
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+# Note: Does not support GGML_QKK_64
+QK_K = 256
+# Items here are (block size, type size)
+GGML_QUANT_SIZES = {
+    gguf.GGMLQuantizationType.F32  : (1, 4),
+    gguf.GGMLQuantizationType.F16  : (1, 2),
+    gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
+    gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
+    gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
+    gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
+    gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
+    gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
+    gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
+    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
+}
+
+class GGMLFormat(IntEnum):
+    GGML = 0
+    GGMF = 1
+    GGJT = 2
+
+class GGMLFType(IntEnum):
+    ALL_F32 = 0
+    MOSTLY_F16 = 1
+    MOSTLY_Q4_0 = 2
+    MOSTLY_Q4_1 = 3
+    MOSTLY_Q4_1_SOME_F16 = 4
+    MOSTLY_Q8_0 = 7
+    MOSTLY_Q5_0 = 8
+    MOSTLY_Q5_1 = 9
+    MOSTLY_Q2_K = 10
+    MOSTLY_Q3_K_S = 11
+    MOSTLY_Q3_K_M = 12
+    MOSTLY_Q3_K_L = 13
+    MOSTLY_Q4_K_S = 14
+    MOSTLY_Q4_K_M = 15
+    MOSTLY_Q5_K_S = 16
+    MOSTLY_Q5_K_M = 17
+    MOSTLY_Q6_K = 18
+
+class Hyperparameters:
+    def __init__(self):
+        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
+        self.n_layer = self.n_rot = self.n_ff = 0
+        self.ftype = GGMLFType.ALL_F32
+
+    def set_n_ff(self, model):
+        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
+        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
+        ff_tensor = model.tensors[ff_tensor_idx]
+        self.n_ff = ff_tensor.dims[1]
+
+    def load(self, data, offset):
+        (
+            self.n_vocab,
+            self.n_embd,
+            self.n_mult,
+            self.n_head,
+            self.n_layer,
+            self.n_rot,
+            ftype,
+        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+        try:
+            self.ftype = GGMLFType(ftype)
+        except ValueError:
+            raise ValueError(f'Invalid ftype {ftype}')
+        return 4 * 7
+
+    def __str__(self):
+        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
+
+class Vocab:
+    def __init__(self, load_scores = True):
+        self.items = []
+        self.load_scores = load_scores
+
+    def load(self, data, offset, n_vocab):
+        orig_offset = offset
+        for _ in range(n_vocab):
+            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
+            assert itemlen < 4096, 'Absurd vocab item length'
+            offset += 4
+            item_text = bytes(data[offset:offset + itemlen])
+            offset += itemlen
+            if self.load_scores:
+                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
+                offset += 4
+            else:
+                item_score = 0.0
+            self.items.append((item_text, item_score))
+        return offset - orig_offset
+
+class Tensor:
+    def __init__(self, use_padding = True):
+        self.name = None
+        self.dims: tuple[int, ...] = ()
+        self.dtype = None
+        self.start_offset = 0
+        self.len_bytes = np.int64(0)
+        self.use_padding = use_padding
+
+    def load(self, data, offset):
+        orig_offset = offset
+        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
+        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
+        assert name_len < 4096, 'Absurd tensor name length'
+        quant = GGML_QUANT_SIZES.get(dtype)
+        assert quant is not None, 'Unknown tensor type'
+        (blksize, tysize) = quant
+        offset += 12
+        self.dtype = dtype
+        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
+        offset += 4 * n_dims
+        self.name = bytes(data[offset:offset + name_len])
+        offset += name_len
+        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
+        offset += pad
+        n_elems = np.prod(self.dims)
+        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
+        self.start_offset = offset
+        self.len_bytes = n_bytes
+        offset += n_bytes
+        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
+        return offset - orig_offset
+
+class GGMLModel:
+    def __init__(self):
+        self.hyperparameters = None
+        self.vocab = None
+        self.tensor_map = {}
+        self.tensors = []
+
+    def validate_header(self, data, offset):
+        magic = bytes(data[offset:offset + 4])
+        if magic == b'GGUF':
+            raise ValueError('File is already in GGUF format.')
+        if magic == b'lmgg':
+            self.file_format = GGMLFormat.GGML
+            self.format_version = 1
+            return 4
+        version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
+        if magic == b'fmgg':
+            if version != 1:
+                raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
+            self.file_format = GGMLFormat.GGMF
+            self.format_version = version
+            return 8
+        if magic == b'tjgg':
+            if version < 1 or version > 3:
+                raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
+            self.file_format = GGMLFormat.GGJT
+            self.format_version = version
+            return 8
+        raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
+
+    def validate_conversion(self, ftype):
+        err = ''
+        if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
+            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
+                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
+        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
+            if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
+                          GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
+                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
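+        # In summary (following the checks above):
+        #   GGML / GGMF / GGJT v1 : only ALL_F32 and MOSTLY_F16 are convertible
+        #   GGJT v2               : convertible except the Q4/Q8 ftypes listed above
+        #   GGJT v3               : every ftype this script understands is convertible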
+ if len(err) > 0: + raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.') + + def load(self, data, offset): + offset += self.validate_header(data, offset) + hp = Hyperparameters() + offset += hp.load(data, offset) + print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}') + self.validate_conversion(hp.ftype) + vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML) + offset += vocab.load(data, offset, hp.n_vocab) + tensors: list[Tensor] = [] + tensor_map = {} + while offset < len(data): + tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF) + offset += tensor.load(data, offset) + tensor_map[tensor.name] = len(tensors) + tensors.append(tensor) + self.hyperparameters = hp + self.vocab = vocab + self.tensors = tensors + self.tensor_map = tensor_map + hp.set_n_ff(self) + return offset + +class GGMLToGGUF: + def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None): + hp = ggml_model.hyperparameters + self.model = ggml_model + self.data = data + self.cfg = cfg + self.params_override = params_override + self.vocab_override = vocab_override + self.special_vocab = special_vocab + if params_override is not None: + n_kv_head = params_override.n_head_kv + else: + if cfg.gqa == 1: + n_kv_head = hp.n_head + else: + gqa = float(cfg.gqa) + n_kv_head = None + for x in range(1, 256): + if float(hp.n_head) / float(x) == gqa: + n_kv_head = x + assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param" + print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}') + self.n_kv_head = n_kv_head + self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer) + + def save(self): + print('* Preparing to save GGUF file') + gguf_writer = gguf.GGUFWriter( + self.cfg.output, + gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], + use_temp_file = False ) + self.add_params(gguf_writer) + self.add_vocab(gguf_writer) + if self.special_vocab is not None: + self.special_vocab.add_to_gguf(gguf_writer) + self.add_tensors(gguf_writer) + print(" gguf: write header") + gguf_writer.write_header_to_file() + print(" gguf: write metadata") + gguf_writer.write_kv_data_to_file() + print(" gguf: write tensors") + gguf_writer.write_tensors_to_file() + gguf_writer.close() + + def add_params(self, gguf_writer): + hp = self.model.hyperparameters + cfg = self.cfg + if cfg.desc is not None: + desc = cfg.desc + else: + desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format' + try: + # Filenames aren't necessarily valid UTF8. 
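+            # (A path that is not valid UTF-8 can raise UnicodeDecodeError here;
+            # in that case the model name KV item is simply omitted below.)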
+ name = cfg.name if cfg.name is not None else cfg.input.name + except UnicodeDecodeError: + name = None + print('* Adding model parameters and KV items') + if name is not None: + gguf_writer.add_name(name) + gguf_writer.add_description(desc) + gguf_writer.add_file_type(int(hp.ftype)) + if self.params_override is not None: + po = self.params_override + assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch' + assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch' + assert po.n_head == hp.n_head, 'Model hyperparams mismatch' + gguf_writer.add_context_length (po.n_ctx) + gguf_writer.add_embedding_length (po.n_embd) + gguf_writer.add_block_count (po.n_layer) + gguf_writer.add_feed_forward_length (po.n_ff) + gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head) + gguf_writer.add_head_count (po.n_head) + gguf_writer.add_head_count_kv (po.n_head_kv) + gguf_writer.add_layer_norm_rms_eps (po.f_norm_eps) + return + gguf_writer.add_context_length(cfg.context_length) + gguf_writer.add_embedding_length(hp.n_embd) + gguf_writer.add_block_count(hp.n_layer) + gguf_writer.add_feed_forward_length(hp.n_ff) + gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head) + gguf_writer.add_head_count(hp.n_head) + gguf_writer.add_head_count_kv(self.n_kv_head) + gguf_writer.add_layer_norm_rms_eps(float(cfg.eps)) + + def add_vocab(self, gguf_writer): + hp = self.model.hyperparameters + gguf_writer.add_tokenizer_model('llama') + tokens = [] + scores = [] + toktypes = [] + if self.vocab_override is not None: + vo = self.vocab_override + print('* Adding vocab item(s)') + for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): + tokens.append(vbytes) + scores.append(score) + toktypes.append(ttype) + assert len(tokens) == hp.n_vocab, \ + f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}' + gguf_writer.add_token_list(tokens) + gguf_writer.add_token_scores(scores) + if len(toktypes) > 0: + gguf_writer.add_token_types(toktypes) + return + print(f'* Adding {hp.n_vocab} vocab item(s)') + assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab' + for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items): + tt = 1 # Normal + # Special handling for UNK, BOS, EOS tokens. 
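+            # With a standard SentencePiece vocab the fixed ids map as follows
+            # (see the branches below):
+            #   id 0       -> <unk>   toktype 2 (UNKNOWN)
+            #   id 1       -> <s>     toktype 3 (CONTROL, BOS)
+            #   id 2       -> </s>    toktype 3 (CONTROL, EOS)
+            #   ids 3..258 -> <0xXX>  toktype 6 (BYTE), when the piece is a single byte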
+            if tokid <= 2:
+                if tokid == 0:
+                    vbytes = b'<unk>'
+                    tt = 2
+                elif tokid == 1:
+                    vbytes = b'<s>'
+                    tt = 3
+                else:
+                    vbytes = b'</s>'
+                    tt = 3
+            elif len(vbytes) == 0:
+                tt = 3 # Control
+            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
+                vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
+                tt = 6 # Byte
+            else:
+                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
+            toktypes.append(tt)
+            tokens.append(vbytes)
+            scores.append(vscore)
+        gguf_writer.add_token_list(tokens)
+        gguf_writer.add_token_scores(scores)
+        gguf_writer.add_token_types(toktypes)
+        gguf_writer.add_unk_token_id(0)
+        gguf_writer.add_bos_token_id(1)
+        gguf_writer.add_eos_token_id(2)
+
+    def add_tensors(self, gguf_writer):
+        tensor_map = self.name_map
+        data = self.data
+        print(f'* Adding {len(self.model.tensors)} tensor(s)')
+        for tensor in self.model.tensors:
+            name = str(tensor.name, 'UTF-8')
+            mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+            assert mapped_name is not None, f'Bad name {name}'
+            tempdims = list(tensor.dims[:])
+            if len(tempdims) > 1:
+                temp = tempdims[1]
+                tempdims[1] = tempdims[0]
+                tempdims[0] = temp
+            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
+            gguf_writer.add_tensor(
+                mapped_name,
+                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
+                raw_shape = tempdims,
+                raw_dtype = tensor.dtype )
+
+def handle_metadata(cfg, hp):
+    import convert
+    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
+    hf_config_path = cfg.model_metadata_dir / "config.json"
+    orig_config_path = cfg.model_metadata_dir / "params.json"
+    # We pass a fake model here. "original" mode will check the shapes of some
+    # tensors if information is missing in the .json file: other than that, the
+    # model data isn't used so this should be safe (at least for now).
+    fakemodel = {
+        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+    }
+    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
+    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
+    if hf_config_path.exists():
+        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
+    elif orig_config_path.exists():
+        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
+    else:
+        raise ValueError('Unable to load metadata')
+    vocab = convert.load_vocab(
+        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
+        cfg.vocabtype )
+    # FIXME: Respect cfg.vocab_dir?
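+    # Note that special tokens are always read from model_metadata_dir, even
+    # when --vocab-dir points somewhere else (hence the FIXME above).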
+ svocab = gguf.SpecialVocab(cfg.model_metadata_dir) + convert.check_vocab_size(params, vocab) + return (params, vocab, svocab) + +def handle_args(): + parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF') + parser.add_argument('--input', '-i', type = Path, required = True, + help = 'Input GGMLv3 filename') + parser.add_argument('--output', '-o', type = Path, required = True, + help ='Output GGUF filename') + parser.add_argument('--name', + help = 'Set model name') + parser.add_argument('--desc', + help = 'Set model description') + parser.add_argument('--gqa', type = int, default = 1, + help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') + parser.add_argument('--eps', default = '5.0e-06', + help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') + parser.add_argument('--context-length', '-c', type=int, default = 2048, + help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') + parser.add_argument('--model-metadata-dir', '-m', type = Path, + help ='Load HuggingFace/.pth vocab and metadata from the specified directory') + parser.add_argument("--vocab-dir", type=Path, + help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") + parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm", + help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") + return parser.parse_args() + +def main(): + cfg = handle_args() + print(f'* Using config: {cfg}') + print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n') + if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'): + print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".') + data = np.memmap(cfg.input, mode = 'r') + model = GGMLModel() + print('* Scanning GGML input file') + offset = model.load(data, 0) + print(f'* GGML model hyperparameters: {model.hyperparameters}') + vocab_override = None + params_override = None + special_vocab = None + if cfg.model_metadata_dir is not None: + (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters) + print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.') + print(f'* Overriding params: {params_override}') + print(f'* Overriding vocab: {vocab_override}') + print(f'* Special vocab: {special_vocab}') + else: + print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') + if model.file_format == GGMLFormat.GGML: + print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') + converter = GGMLToGGUF(model, data, cfg, + params_override = params_override, + vocab_override = vocab_override, + special_vocab = special_vocab ) + converter.save() + print(f'* Successful completion. 
Output saved to: {cfg.output}') + +if __name__ == '__main__': + main() diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index b4999ff5a07c866b80f601d03fc7fe910ebdb7a3..a937410dd8a9f3ad6c216617abb4bd661a161555 100644 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -1,28 +1,29 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +from __future__ import annotations + import json import os import re import struct import sys -from typing import Any, Dict, Sequence, TextIO +from typing import Any, BinaryIO, Sequence +import numpy as np import torch -from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType +NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} + HF_SUBLAYER_TO_GGML = { - "self_attn.q_proj": "attention.wq", - "self_attn.k_proj": "attention.wk", - "self_attn.v_proj": "attention.wv", - "self_attn.o_proj": "attention.wo", - "mlp.gate_proj": "feed_forward.w1", - "mlp.down_proj": "feed_forward.w2", - "mlp.up_proj": "feed_forward.w3", - "input_layernorm": "attention_norm", + "self_attn.q_proj": "attn_q", + "self_attn.k_proj": "attn_k", + "self_attn.v_proj": "attn_v", + "self_attn.o_proj": "attn_output", + "mlp.gate_proj": "ffn_gate", + "mlp.down_proj": "ffn_down", + "mlp.up_proj": "ffn_up", + "input_layernorm": "attn_norm", "post_attention_layernorm": "ffn_norm", - # "norm": "norm", - # "embed_tokens": "tok_embeddings", - # "lm_head": "output", } @@ -39,7 +40,7 @@ def translate_tensor_name(t: str) -> str: sys.exit(1) output_string = ( - f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}" + f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}" ) return output_string else: @@ -47,19 +48,21 @@ def translate_tensor_name(t: str) -> str: sys.exit(1) -def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None: +def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None: fout.write(b"ggla"[::-1]) # magic (ggml lora) fout.write(struct.pack("i", 1)) # file version fout.write(struct.pack("i", params["r"])) # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int # but some models ship a float value instead # let's convert to int, but fail if lossless conversion is not possible - assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly" + assert ( + int(params["lora_alpha"]) == params["lora_alpha"] + ), "cannot convert float to int losslessly" fout.write(struct.pack("i", int(params["lora_alpha"]))) def write_tensor_header( - self, name: str, shape: Sequence[int], data_type: DataType + self, name: str, shape: Sequence[int], data_type: np.dtype[Any] ) -> None: sname = name.encode("utf-8") fout.write( @@ -67,7 +70,7 @@ def write_tensor_header( "iii", len(shape), len(sname), - DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]], + NUMPY_TYPE_TO_FTYPE[data_type.name], ) ) fout.write(struct.pack("i" * len(shape), *shape[::-1])) diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py new file mode 100644 index 0000000000000000000000000000000000000000..331e84e985a2f1fd2e11423101777364962123de --- /dev/null +++ b/convert-starcoder-hf-to-gguf.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +# HF starcoder --> gguf conversion + +from __future__ import annotations + +import argparse +import json +import os +import struct +import sys +from pathlib import Path +from typing import Any + +import numpy as np +import torch +from transformers import AutoTokenizer # type: ignore[import] + +if 'NO_LOCAL_GGUF' 
not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + +def bytes_to_unicode(): + # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + return dict(zip(bs, (chr(n) for n in cs))) + + +def count_model_parts(dir_model: Path) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") + parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] != "GPTBigCodeForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit(1) + +# get number of model parts +num_parts = count_model_parts(dir_model) + +ARCH=gguf.MODEL_ARCH.STARCODER +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams["n_layer"] + +gguf_writer.add_name("StarCoder") +gguf_writer.add_context_length(hparams["n_positions"]) +gguf_writer.add_embedding_length(hparams["n_embd"]) +gguf_writer.add_feed_forward_length(4 * hparams["n_embd"]) +gguf_writer.add_block_count(block_count) +gguf_writer.add_head_count(hparams["n_head"]) +gguf_writer.add_head_count_kv(1) +gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) +gguf_writer.add_file_type(ftype) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +tokenizer_json_file = dir_model / 
'tokenizer.json'
+if not tokenizer_json_file.is_file():
+    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
+    sys.exit(1)
+
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")
+
+with open(tokenizer_json_file, "r", encoding="utf-8") as f:
+    tokenizer_json = json.load(f)
+
+print("gguf: get gpt2 tokenizer vocab")
+
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running inference.
+vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
+
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+for i in range(vocab_size):
+    if i in reverse_vocab:
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if ord(c) < 256:  # single byte character
+                    text.append(byte_decoder[ord(c)])
+                else:  # multibyte special token character
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)
+
+    tokens.append(text)
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)  # dummy
+
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
+
+# params for qkv transform
+n_head = hparams["n_head"]
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+
+head_dim = hparams["n_embd"] // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(dir_model / part_name, map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            print("Cannot map tensor '" + name + "'")
+            sys.exit(1)
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
diff --git a/convert.py b/convert.py
index f3bf1798089ccd8d4c9f4db85deb313f60d8c175..4ac5030db61eb2ce1cdc99242f4a9100f43bb768 100644
--- a/convert.py
+++ b/convert.py
@@ -1,4 +1,6 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
+from __future__ import annotations
+
 import argparse
 import concurrent.futures
 import copy
@@ -15,141 +17,151 @@ import re
 import signal
 import struct
 import sys
+import time
 import zipfile
 from abc import ABCMeta, abstractmethod
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List,
-                    Literal, Optional, Sequence, Tuple, TypeVar, Union)
+from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar
 
 import numpy as np
-from sentencepiece import SentencePieceProcessor # type: ignore
+from sentencepiece import SentencePieceProcessor # type: ignore[import]
+
+import os
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
 
 if TYPE_CHECKING:
-    from typing_extensions import TypeAlias
+    from typing import TypeAlias
 
 if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
     faulthandler.register(signal.SIGUSR1)
 
-NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+NDArray: TypeAlias = 'np.ndarray[Any, Any]'
+
+ARCH=gguf.MODEL_ARCH.LLAMA
+NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
+
+DEFAULT_CONCURRENCY = 8
+#
+# data types
+#
 
 @dataclass(frozen=True)
-class UnquantizedDataType:
+class DataType:
     name: str
+    dtype: np.dtype[Any]
+    valid_conversions: list[str]
 
-
-DT_F16 = UnquantizedDataType('F16')
-DT_F32 = UnquantizedDataType('F32')
-DT_I32 = UnquantizedDataType('I32')
-DT_BF16 = UnquantizedDataType('BF16')
-
+    def elements_to_bytes(self, n_elements: int) -> int:
+        return n_elements * self.dtype.itemsize
 
 @dataclass(frozen=True)
-class QuantizedDataType:
-    groupsize: int
-    have_addends: bool
-    have_g_idx: bool
-
+class UnquantizedDataType(DataType):
+    pass
 
-DT_Q4_0 = QuantizedDataType(groupsize=32, have_addends=False, have_g_idx=False)
-DT_Q4_1 = QuantizedDataType(groupsize=32, have_addends=True, have_g_idx=False)
+DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
+DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
+DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
+DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
 
-DataType = Union[UnquantizedDataType, QuantizedDataType]
 
+@dataclass(frozen=True)
+    block_size: int
+    quantized_dtype: np.dtype[Any]
+    ggml_type: gguf.GGMLQuantizationType
 
-DATA_TYPE_TO_FTYPE: Dict[DataType, int] = {
-    DT_F32: 0,
-    DT_F16: 1,
-    DT_Q4_0: 2,
-    DT_Q4_1: 3,
-}
+    def quantize(self, arr: NDArray) -> NDArray:
+        raise NotImplementedError(f'Quantization for {self.name} not implemented')
 
-FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \
-    {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}
+    def elements_to_bytes(self, n_elements: int) -> int:
+        assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
+        return self.quantized_dtype.itemsize * (n_elements // self.block_size)
 
-DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
-    DT_BF16: np.dtype(np.uint16),
-    DT_F16: np.dtype(np.float16),
-    DT_F32: np.dtype(np.float32),
-    DT_I32: np.dtype(np.int32),
+@dataclass(frozen=True)
+class Q8_0QuantizedDataType(QuantizedDataType):
+    # Mini Q8_0 quantization in Python!
+    def quantize(self, arr: NDArray) -> NDArray:
+        assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
+        assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
+        n_blocks = arr.size // self.block_size
+        blocks = arr.reshape((n_blocks, self.block_size))
+        # Much faster implementation of block quantization contributed by @Cebtenzzre
+        def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
+            d = abs(blocks).max(axis = 1) / np.float32(127)
+            with np.errstate(divide = 'ignore'):
+                qs = (blocks / d[:, None]).round()
+            qs[d == 0] = 0
+            yield from zip(d, qs)
+        return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
+
+DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
+    dtype = np.dtype(np.float32), valid_conversions = [],
+    ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
+    quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
+
+# Quantized types skipped here because they may also map to np.float32
+NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
+for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
+    if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
+        raise ValueError(f'Invalid duplicate data type {dt}')
+    NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
+
+SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
+    'BF16': DT_BF16,
+    'F16': DT_F16,
+    'F32': DT_F32,
+    'I32': DT_I32,
 }
 
-class GGMLFileType(enum.Enum):
-    AllF32     = 0
-    MostlyF16  = 1  # except 1d tensors
-    MostlyQ4_0 = 2  # except 1d tensors
-    MostlyQ4_1 = 3  # except 1d tensors
-    PerLayerIsQ4_1 = 4  # but tok_embeddings.weight and output.weight are F16
-
-    def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
-        if len(tensor.shape) == 1:
-            # 1D tensors are always F32.
-            return DT_F32
-        elif self == GGMLFileType.AllF32:
-            return DT_F32
-        elif self == GGMLFileType.MostlyF16:
-            return DT_F16
-        elif self == GGMLFileType.MostlyQ4_0:
-            return DT_Q4_0
-        elif self == GGMLFileType.MostlyQ4_1:
-            return DT_Q4_1
-        elif self == GGMLFileType.PerLayerIsQ4_1:
-            if name in ('output.weight', 'tok_embeddings.weight'):
-                return DT_F16
-            else:
-                return DT_Q4_1
-        else:
+# TODO: match this with `llama_ftype`
+# TODO: rename to LLAMAFileType
+# TODO: move to `gguf.py`
+class GGMLFileType(enum.IntEnum):
+    AllF32     = 0
+    MostlyF16  = 1  # except 1d tensors
+    MostlyQ8_0 = 7  # except 1d tensors
+
+    def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
+        dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
+        if dt is None:
             raise ValueError(self)
+        # 1D tensors are always F32.
+ return dt if len(tensor.shape) > 1 else DT_F32 +GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { + GGMLFileType.AllF32 : DT_F32, + GGMLFileType.MostlyF16 : DT_F16, + GGMLFileType.MostlyQ8_0: DT_Q8_0, +} -def make_tensors_list() -> List[str]: - ret = [ - 'tok_embeddings.weight', - 'norm.weight', - 'output.weight', - ] - for i in range(80): # maximum number of layer - ret += [ - f'layers.{i}.attention.wq.weight', - f'layers.{i}.attention.wk.weight', - f'layers.{i}.attention.wv.weight', - f'layers.{i}.attention.wo.weight', - f'layers.{i}.attention_norm.weight', - f'layers.{i}.feed_forward.w1.weight', - f'layers.{i}.feed_forward.w2.weight', - f'layers.{i}.feed_forward.w3.weight', - f'layers.{i}.ffn_norm.weight', - ] - return ret +# +# hparams loading +# +@dataclass +class Params: + n_vocab: int + n_embd: int + n_layer: int + n_ctx: int + n_ff: int + n_head: int + n_head_kv: int + f_norm_eps: float -TENSORS_LIST = make_tensors_list() -TENSORS_SET = set(TENSORS_LIST) + f_rope_freq_base: float | None = None + f_rope_scale: float | None = None + ftype: GGMLFileType | None = None -def find_n_mult(n_ff: int, n_embd: int) -> int: - # hardcoded magic range - for n_mult in range(8192, 1, -1): - calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult - if calc_ff == n_ff: - return n_mult - raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).") - -@dataclass -class Params: - n_vocab: int - n_embd: int - n_mult: int - n_head: int - n_layer: int - n_kv_head: Optional[int] # This parameter is only used for Llama 2 + # path to the directory containing the model files + path_model: Path | None = None @staticmethod - def guessed(model: 'LazyModel') -> 'Params': + def guessed(model: LazyModel) -> Params: # try transformer naming first n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape @@ -165,65 +177,110 @@ class Params: raise Exception("failed to guess 'n_layer'. 
This model is unknown or unsupported.\n" "Suggestion: provide 'config.json' of the model in the same directory containing model files.") - n_head=n_embd // 128 # guessed + n_head = n_embd // 128 # guessed + n_mult = 256 # guessed + + # TODO: verify this + n_ff = int(2 * (4 * n_embd) / 3) + n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult) return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_mult = 256, - n_head = n_head, - n_layer = n_layer, - n_kv_head = None, + n_vocab = n_vocab, + n_embd = n_embd, + n_layer = n_layer, + n_ctx = -1, + n_ff = n_ff, + n_head = n_head, + n_head_kv = n_head, + f_norm_eps = 1e-5, ) @staticmethod - def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params': + def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_vocab = config["vocab_size"]; - n_embd = config["hidden_size"]; - n_head = config["num_attention_heads"]; - n_layer = config["num_hidden_layers"]; - n_ff = config["intermediate_size"]; - n_kv_head = config.get("num_key_value_heads") + n_vocab = config["vocab_size"] + n_embd = config["hidden_size"] + n_layer = config["num_hidden_layers"] + n_ff = config["intermediate_size"] + n_head = config["num_attention_heads"] + n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head + f_norm_eps = config["rms_norm_eps"] + f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None + + rope_scaling = config.get("rope_scaling") + if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear": + f_rope_scale = config["rope_scaling"].get("factor") + else: + f_rope_scale = None - n_mult = find_n_mult(n_ff, n_embd); + if "max_sequence_length" in config: + n_ctx = config["max_sequence_length"] + elif "max_position_embeddings" in config: + n_ctx = config["max_position_embeddings"] + else: + raise Exception("failed to guess 'n_ctx'. 
This model is unknown or unsupported.\n" + "Suggestion: provide 'config.json' of the model in the same directory containing model files.") return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_mult = n_mult, - n_head = n_head, - n_layer = n_layer, - n_kv_head = n_kv_head, + n_vocab = n_vocab, + n_embd = n_embd, + n_layer = n_layer, + n_ctx = n_ctx, + n_ff = n_ff, + n_head = n_head, + n_head_kv = n_head_kv, + f_norm_eps = f_norm_eps, + f_rope_freq_base = f_rope_freq_base, + f_rope_scale = f_rope_scale, ) # LLaMA v2 70B params.json - # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1 + # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} @staticmethod - def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params': + def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_vocab = config["vocab_size"]; - n_embd = config["dim"]; - n_head = config["n_heads"]; - n_layer = config["n_layers"]; - n_mult = config["multiple_of"]; + n_vocab = config["vocab_size"] if "vocab_size" in config else -1 + n_embd = config["dim"] + n_layer = config["n_layers"] + n_ff = -1 + n_head = config["n_heads"] + n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head + f_norm_eps = config["norm_eps"] + f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None + + # hack to determine LLaMA v1 vs v2 vs CodeLlama + if f_rope_freq_base == 1000000: + # CodeLlama + n_ctx = 16384 + elif config["norm_eps"] == 1e-05: + # LLaMA v2 + n_ctx = 4096 + else: + # LLaMA v1 + n_ctx = 2048 if n_vocab == -1: n_vocab = model["tok_embeddings.weight"].shape[0] + if n_ff == -1: + n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] + return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_mult = n_mult, - n_head = n_head, - n_layer = n_layer, - n_kv_head = None, + n_vocab = n_vocab, + n_embd = n_embd, + n_layer = n_layer, + n_ctx = n_ctx, + n_ff = n_ff, + n_head = n_head, + n_head_kv = n_head_kv, + f_norm_eps = f_norm_eps, + f_rope_freq_base = f_rope_freq_base, ) @staticmethod - def load(model_plus: 'ModelPlus') -> 'Params': + def load(model_plus: ModelPlus) -> Params: hf_config_path = model_plus.paths[0].parent / "config.json" orig_config_path = model_plus.paths[0].parent / "params.json" @@ -231,33 +288,104 @@ class Params: params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) elif orig_config_path.exists(): params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) - else: + elif model_plus.format != 'none': params = Params.guessed(model_plus.model) + else: + raise ValueError('Cannot guess params when model format is none') + + params.path_model = model_plus.paths[0].parent - print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}') return params -class SentencePieceVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None: - self.vocabtype = vocabtype - if self.vocabtype == "bpe": - self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read()) +# +# vocab +# + +class BpeVocab: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: + self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), 
encoding="utf-8").read()) + added_tokens: dict[str, int] + if fname_added_tokens is not None: + # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. + added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) else: - self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) - added_tokens: Dict[str, int] + # Fall back to trying to find the added tokens in tokenizer.json + tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' + if not tokenizer_json_file.is_file(): + added_tokens = {} + else: + tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) + added_tokens = dict( + (item['content'], item['id']) + for item in tokenizer_json.get('added_tokens', []) + # Added tokens here can be duplicates of the main vocabulary. + if item['content'] not in self.bpe_tokenizer ) + + vocab_size: int = len(self.bpe_tokenizer) + expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) + actual_ids = sorted(added_tokens.values()) + if expected_ids != actual_ids: + expected_end_id = vocab_size + len(actual_ids) - 1 + raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}") + + items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) + self.added_tokens_list = [text for (text, idx) in items] + self.vocab_size_base: int = vocab_size + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer + self.fname_added_tokens = fname_added_tokens + + def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + tokenizer = self.bpe_tokenizer + from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import] + byte_encoder = tokenization_gpt2.bytes_to_unicode() + byte_decoder = {v: k for k, v in byte_encoder.items()} + score = 0.0 + for i, item in enumerate(tokenizer): + text: bytes = item.encode("utf-8") + # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior? 
+            if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
+                if i == 0 and text == b'<unk>':
+                    toktype = gguf.TokenType.UNKNOWN
+                elif i == 1 or i == 2:
+                    toktype = gguf.TokenType.CONTROL
+                elif i >= 3 and text.startswith(b'<0x'):
+                    toktype = gguf.TokenType.BYTE
+                else:
+                    toktype = gguf.TokenType.NORMAL
+            else:
+                toktype = gguf.TokenType.NORMAL
+            yield text, score, toktype
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: dict[str, int]
         if fname_added_tokens is not None:
-            added_tokens = json.load(open(fname_added_tokens))
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
-        if self.vocabtype == "bpe":
-            vocab_size: int = len(self.sentencepiece_tokenizer)
-        else:
-            vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
+        actual_ids   = sorted(added_tokens.values())
         if expected_ids != actual_ids:
             raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+
         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
         self.added_tokens_list = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size
@@ -265,126 +393,74 @@ class SentencePieceVocab:
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
 
-    def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
+    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.sentencepiece_tokenizer
-        if self.vocabtype == "bpe":
-            from transformers.models.gpt2 import tokenization_gpt2
-            byte_encoder = tokenization_gpt2.bytes_to_unicode()
-            byte_decoder = {v: k for k, v in byte_encoder.items()}
-            for i, item in enumerate(tokenizer):
-                text: bytes
-                text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
-                score: float = -i
-                yield text, score
-        else:
-            for i in range(tokenizer.vocab_size()):
-                text: bytes
-                if tokenizer.is_unknown(i):
-                    text = " \u2047 ".encode("utf-8")
-                elif tokenizer.is_control(i):
-                    text = b""
-                elif tokenizer.is_byte(i):
-                    piece = tokenizer.id_to_piece(i)
-                    if len(piece) != 6:
-                        raise Exception(f"Invalid token: {piece}")
-                    byte_value = int(piece[3:-1], 16)
-                    text = struct.pack("B", byte_value)
-                else:
-                    text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-                score: float = tokenizer.get_score(i)
-                yield text, score
-
-    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.id_to_piece(i)
+            text: bytes = piece.encode("utf-8")
+            score: float = tokenizer.get_score(i)
+
+            toktype = gguf.TokenType.NORMAL
+            if tokenizer.is_unknown(i):
+                toktype = gguf.TokenType.UNKNOWN
+            if tokenizer.is_control(i):
+                toktype = gguf.TokenType.CONTROL
+
+            # NOTE: I think added_tokens are user defined.
+            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
+
+            if tokenizer.is_unused(i):
+                toktype = gguf.TokenType.UNUSED
+            if tokenizer.is_byte(i):
+                toktype = gguf.TokenType.BYTE
+
+            yield text, score, toktype
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
 
-    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         yield from self.sentencepiece_tokens()
         yield from self.added_tokens()
 
     def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
+Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
 
-class GGMLVocab:
-    def __init__(self, tokens: List[Tuple[bytes, float]]):
-        self.tokens = tokens
-        self.vocab_size = len(tokens)
+#
+# data loading
+# TODO: reuse (probably move to gguf.py?)
+#
 
-    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
-        return self.tokens
-
-    def __repr__(self) -> str:
-        return f"<GGMLVocab with {self.vocab_size} tokens>"
-
-
-Vocab = Union[SentencePieceVocab, GGMLVocab]
-
-
-def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
-    if n_kv_head is not None and n_head != n_kv_head:
-        n_head //= n_kv_head
+def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
+    #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_head_kv) )
+    if n_head_kv is not None and n_head != n_head_kv:
+        n_head //= n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
             .reshape(weights.shape))
 
-def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
-    # First reinterpret each row from a list of int32s containing 8 values each
-    # to a list of uint8s containing 2 values each.
-    qvalues_pack8 = qvalues_pack32.view(np.uint8)
-
-    # Then split out the two values per int8 (which requires an actual
-    # conversion because numpy doesn't natively support int4s).
-    qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8)
-    qvalues[:, 0::2] = qvalues_pack8 & 0xf
-    qvalues[:, 1::2] = qvalues_pack8 >> 4
-
-    assert addends is None or addends.shape == scales.shape
-    assert qvalues.shape[0] == scales.shape[0]
-    assert qvalues.shape[1] % scales.shape[1] == 0
-    if g_idx is None:
-        repeat_count = qvalues.shape[1] // scales.shape[1]
-        scales = scales[:, :, np.newaxis]
-        if addends is not None:
-            addends = addends[:, :, np.newaxis]
-        # Reshape so that the below computation broadcasts over scales and addends:
-        qvalues.shape = (qvalues.shape[0], scales.shape[1], int(repeat_count))
-    else:
-        # In this case the scale and addend is selected for each column by g_idx:
-        assert addends is not None
-        scales = scales[:, g_idx]
-        addends = addends[:, g_idx]
-    if addends is None:
-        # Q4_0
-        qvalues = qvalues.view(np.int8)
-        qvalues -= 8
-    # And do the actual 'value = scale * qvalue + addend' computation.
-    values = scales * qvalues
-    if addends is not None:
-        values += addends
-    if g_idx is None:
-        values.shape = (values.shape[0], values.shape[1] * values.shape[2])
-    return values
-
-
 class Tensor(metaclass=ABCMeta):
     data_type: DataType
 
     @abstractmethod
-    def astype(self, data_type: DataType) -> 'Tensor': ...
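
The `permute()` above reorders the rows of the Q/K projection weights so the half-split rotary layout matches GGML's interleaved convention. A toy numeric demo of the row shuffle it performs (a sketch, not the converter's code):

```python
import numpy as np

# With n_head=2 heads of dim 4, rows grouped as [h0_first_halves, h0_second_halves, ...]
# get interleaved back into per-head order.
def permute(weights, n_head: int, n_head_kv: int):
    if n_head_kv is not None and n_head != n_head_kv:
        n_head //= n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

w = np.arange(8)[:, None] * np.ones((8, 3))   # 8 rows: 2 heads x head_dim 4
print(permute(w, 2, 2)[:, 0])                 # row order becomes [0 2 1 3 4 6 5 7]
```
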
+ def astype(self, data_type: DataType) -> Tensor: ... @abstractmethod - def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ... + def permute(self, n_head: int, n_head_kv: int) -> Tensor: ... @abstractmethod - def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ... + def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ... @abstractmethod - def part(self, n_part: int) -> 'UnquantizedTensor': ... + def part(self, n_part: int) -> UnquantizedTensor: ... @abstractmethod - def to_ggml(self) -> 'GGMLCompatibleTensor': ... + def to_ggml(self) -> GGMLCompatibleTensor: ... -def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray: +def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray: assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}" fp32_arr = bf16_arr.astype(np.uint32) << 16 return fp32_arr.view(np.float32) @@ -397,27 +473,27 @@ class UnquantizedTensor(Tensor): self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype] def astype(self, data_type: DataType) -> Tensor: - dtype = DATA_TYPE_TO_NUMPY[data_type] + dtype = data_type.dtype if self.data_type == DT_BF16: self.ndarray = bf16_to_fp32(self.ndarray) return UnquantizedTensor(self.ndarray.astype(dtype)) - def to_ggml(self) -> 'UnquantizedTensor': + def to_ggml(self) -> UnquantizedTensor: return self - def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': + def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: r = self.ndarray.shape[0] // 3 - return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head)) + return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) - def part(self, n_part: int) -> 'UnquantizedTensor': + def part(self, n_part: int) -> UnquantizedTensor: r = self.ndarray.shape[0] // 3 return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...]) - def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor': - return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head)) + def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor: + return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv)) -def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray: +def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray: tensor = lazy_tensor.load() assert isinstance(tensor, UnquantizedTensor) @@ -433,196 +509,24 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv return tensor.ndarray -class GGMLQuantizedTensor(Tensor): - data_type: QuantizedDataType - - def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None: - rows, columns = shape - assert data_type in (DT_Q4_1, DT_Q4_0) # for now - assert isinstance(data_type, QuantizedDataType) # redundant, but mypy complains without this - assert columns % data_type.groupsize == 0 - words_in_block = 6 if data_type == DT_Q4_1 else 5 - self.ndarray = ndarray.view(dtype=np.uint32).reshape((rows, columns // data_type.groupsize, words_in_block)) - self.shape = shape[:] - self.data_type = data_type - - def astype(self, data_type: DataType) -> Tensor: - if data_type == self.data_type: - return self - scales = self.ndarray[:, :, 0].view(np.float32) - if self.data_type.have_addends: - addends = self.ndarray[:, :, 
1].view(np.float32) - else: - addends = None - qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8]) - - dq = dequantize_q4(qweights, scales, addends, g_idx=None) - return UnquantizedTensor(dq).astype(data_type) - - def to_ggml(self) -> 'GGMLQuantizedTensor': - return self - - def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor': - return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type) - - def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': - r = self.ndarray.shape[0] // 3 - return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head)) - - def part(self, n_part: int) -> 'UnquantizedTensor': - r = self.ndarray.shape[0] // 3 - return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...]) - -GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor] - - -class DeferredPermutedTensor(Tensor): - def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None: - self.base = base - self.n_head = n_head - self.n_kv_head = n_kv_head - self.data_type = self.base.data_type - - def astype(self, data_type: DataType) -> Tensor: - return self.base.astype(data_type).permute(self.n_head, self.n_kv_head) - - def to_ggml(self) -> GGMLCompatibleTensor: - return self.base.to_ggml().permute(self.n_head, self.n_kv_head) - - def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor: - raise Exception("shouldn't permute twice") - - -class GPTQForLLaMaQuantizedTensor(Tensor): - def __init__(self, model: 'LazyModel', namebase: str) -> None: - qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32) - scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True) - - bias = model.get(f"{namebase}.bias") - if bias is not None: - # Q4_1 does not support bias; good thing the bias is always all zeros. - assert not np.any(load_unquantized(bias)) - - if f"{namebase}.zeros" in model: - zeros = load_unquantized(model[f"{namebase}.zeros"], np.float32) - else: - qzeros = load_unquantized(model[f"{namebase}.qzeros"], np.int32) - assert qzeros.dtype == np.int32 - zeros = dequantize_q4(qzeros, scales, scales, g_idx=None) - assert zeros.dtype == np.float32 - - assert zeros.shape == scales.shape - - # Output is transposed compared to the input, and addends have their sign flipped. - # Scales and zeros similarly must be transposed but only for newer - # versions of GPTQ-for-LLaMa; the older versions can be identified by - # having shape (n_embd, 1). - qweight = qweight.T - if scales.shape[1] != 1: - scales = scales.T - zeros = zeros.T - - # Output also has signs flipped for the addends. 
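
For reference, the Q4_1 math that this legacy code implements is `value = scale * q + addend` over 32-column groups, with eight 4-bit values packed per int32. A minimal illustration (a sketch assuming little-endian storage, not the removed implementation):

```python
import numpy as np

packed = np.array([[0x76543210]], dtype=np.int32)    # one row, eight 4-bit values 0..7
bytes_ = packed.view(np.uint8)                       # little-endian byte view
q = np.empty((1, 8), dtype=np.uint8)
q[:, 0::2] = bytes_ & 0xF                            # low nibbles
q[:, 1::2] = bytes_ >> 4                             # high nibbles
scale, addend = np.float32(0.5), np.float32(-1.0)
print(scale * q + addend)                            # [[-1. -0.5 0. 0.5 1. 1.5 2. 2.5]]
```
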
- self.qweight = qweight - self.scales = scales - self.addends = -zeros - - self.g_idx: Optional[NDArray] - if f"{namebase}.g_idx" in model: - self.g_idx = load_unquantized(model[f"{namebase}.g_idx"], np.int32) - assert self.g_idx.shape == (qweight.shape[1] * 8,) - else: - self.g_idx = None - - self.shape = [self.qweight.shape[0], self.qweight.shape[1] * 8] - self.data_type = QuantizedDataType(groupsize=self.groupsize(), have_addends=True, - have_g_idx=(self.g_idx is not None)) - - def inspect(self, row: int, col: int) -> None: - '''For debugging.''' - qweight = (self.qweight[row, col // 8] >> (4 * (col & 7))) & 0xf - if self.g_idx is not None: - group = self.g_idx[col] - else: - group = int(col // self.groupsize()) - scale = self.scales[row, group] - addend = self.addends[row, group] - with np.printoptions(precision=None, suppress=True): - print(f'scale:{scale} addend:{addend} qweight:{qweight}') - print('possible values:', np.arange(16) * scale + addend) - print('actual value:', qweight * scale + addend) - - def astype(self, data_type: DataType) -> Tensor: - if isinstance(data_type, QuantizedDataType): - assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False - return self.regroup(data_type.groupsize) - - dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx) - return UnquantizedTensor(dequantized).astype(data_type) - - def groupsize(self) -> int: - assert self.addends.shape == self.scales.shape - assert self.shape[1] % self.scales.shape[1] == 0 - return self.shape[1] // self.scales.shape[1] - - def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor': - # Old versions of GPTQ-for-LLaMa shared scales and addends between all the - # columns in a row. Newer versions share them between every set of N - # columns in a row, where N is the `groupsize` parameter, usually 128. The - # output format shares them between every set of 32 columns. To handle - # this, duplicate scales and addends for every smaller group. - # (In the above, 'row' and 'column' are in the sense of the output.) - assert self.g_idx is None - old_groupsize = self.groupsize() - assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize - ret = copy.copy(self) - ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1) - ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1) - ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False) - return ret - - def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor: - return DeferredPermutedTensor(self, n_head, n_kv_head) - - def to_ggml(self) -> GGMLQuantizedTensor: - # The output format looks like this: - # For each row: - # For each group of 32 columns: - # - addend (float32, 4 bytes) - # - scale (float32, 4 bytes) - # - weights (int4 * 32, 16 bytes) - - if self.groupsize() != 32: - raise Exception("should have been regrouped before converting to ggml") - - # Since the output format is mixed between integers and floats, we have - # to hackily view the floats as int32s just so numpy will let us - # concatenate them. - addends_view = self.addends.view(dtype=np.int32)[:, :, np.newaxis] - scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis] - - # Split into groups of 4 columns (i.e. 
32 columns of quantized data): - grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4]) - - # And concatenate: - grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no') - - return GGMLQuantizedTensor(grouped, self.shape, DT_Q4_1) +GGMLCompatibleTensor = UnquantizedTensor @dataclass class LazyTensor: _load: Callable[[], Tensor] - shape: List[int] + shape: list[int] data_type: DataType description: str def load(self) -> Tensor: ret = self._load() - assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description) + # Should be okay if it maps to the same numpy type? + assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \ + (self.data_type, ret.data_type, self.description) return ret - def astype(self, data_type: DataType) -> 'LazyTensor': + def astype(self, data_type: DataType) -> LazyTensor: self.validate_conversion_to(data_type) def load() -> Tensor: @@ -630,39 +534,28 @@ class LazyTensor: return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') def validate_conversion_to(self, data_type: DataType) -> None: - if data_type == self.data_type: - return - if isinstance(data_type, QuantizedDataType): - if not isinstance(self.data_type, QuantizedDataType): - raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})") - if self.data_type.have_g_idx: - sys.stderr.write( - "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), " - "which is not yet natively supported by GGML. " - "For now you can still convert this model by passing `--outtype f16` to dequantize, " - "but that will result in a much larger output file for no quality benefit.\n") - sys.exit(1) - assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends + if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions: + raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.') -LazyModel = Dict[str, LazyTensor] +LazyModel: TypeAlias = 'dict[str, LazyTensor]' @dataclass class ModelPlus: model: LazyModel - paths: List[Path] # Where this was read from. - format: Literal['ggml', 'torch', 'safetensors'] - vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab. + paths: list[Path] # Where this was read from. + format: Literal['ggml', 'torch', 'safetensors', 'none'] + vocab: Vocab | None # For GGML models (which have vocab built in), the vocab. -def merge_sharded(models: List[LazyModel]) -> LazyModel: +def merge_sharded(models: list[LazyModel]) -> LazyModel: # Original LLaMA models have each file contain one part of each tensor. # Use a dict instead of a set to preserve order. names = {name: None for model in models for name in model} def convert(name: str) -> LazyTensor: - lazy_tensors: List[LazyTensor] = [model[name] for model in models] + lazy_tensors: list[LazyTensor] = [model[name] for model in models] if len(lazy_tensors) == 1: # only one file; don't go through this procedure since there might # be quantized tensors @@ -690,7 +583,7 @@ def merge_sharded(models: List[LazyModel]) -> LazyModel: return {name: convert(name) for name in names} -def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus: +def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus: formats = set(mp.format for mp in models_plus) assert len(formats) == 1, "different formats?" 
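
The `LazyTensor` machinery above defers all tensor I/O and conversion behind closures, so nothing is read until `load()` is finally called. A simplified model of that pattern (hypothetical demo types, not the converter's classes):

```python
import numpy as np
from dataclasses import dataclass
from typing import Callable

@dataclass
class Lazy:
    _load: Callable[[], np.ndarray]
    description: str

    def load(self) -> np.ndarray:
        return self._load()

    def astype(self, dtype) -> 'Lazy':
        # Chain another deferred step instead of converting eagerly.
        return Lazy(lambda: self.load().astype(dtype), f'convert({dtype}) {self.description}')

base = Lazy(lambda: np.ones((2, 2), dtype=np.float32), 'ones')
lazy16 = base.astype(np.float16)       # nothing loaded yet
print(lazy16.load().dtype)             # float16 -- both steps run here
```
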
format = formats.pop() @@ -713,17 +606,17 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus: return ModelPlus(model, paths, format, vocab) -def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor: +def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: def load() -> Tensor: - return lazy_tensor.load().permute(n_head, n_kv_head) - return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description) + return lazy_tensor.load().permute(n_head, n_head_kv) + return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) -def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor: +def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor: def load() -> Tensor: - return lazy_tensor.load().permute_part(n_part, n_head) + return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv) s = lazy_tensor.shape.copy() s[0] = s[0] // 3 - return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) + return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: def load() -> Tensor: @@ -732,66 +625,6 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: s[0] = s[0] // 3 return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description) -def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: - out: LazyModel = {} - out["tok_embeddings.weight"] = model["model.embed_tokens.weight"] - out["norm.weight"] = model["model.norm.weight"] - out["output.weight"] = model["lm_head.weight"] - - for i in itertools.count(): - if f"model.layers.{i}.self_attn.q_proj.weight" in model: - out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head) - out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head) - out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] - elif f"model.layers.{i}.self_attn.W_pack.weight" in model: - out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head) - out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head) - out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) - else: - break - - out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"] - - out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"] - out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"] - out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"] - - out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"] - out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"] - return out - - -def handle_quantization(model: LazyModel) -> LazyModel: - '''Convert a model with entries for 'foo.qweight', 'foo.scales', etc. 
-    (which resolve to UnquantizedTensors with the raw data) to one with entries
-    for 'foo.weight' (which resolve to QuantizedTensors).
-    '''
-    def convert(name: str) -> Tuple[str, LazyTensor]:
-        if name.endswith(".qweight"):
-            namebase = name.rsplit('.', 1)[0]
-            orig_name = namebase + ".weight"
-
-            lazy_tensor = model[name]
-            assert len(lazy_tensor.shape) == 2
-            real_shape = [lazy_tensor.shape[1], lazy_tensor.shape[0] * 8]
-
-            # Calculate type.  This replicates the logic in
-            # GPTQForLLaMaQuantizedTensor (which is executed when the model is
-            # actually loaded).
-            lazy_scales = model[f"{namebase}.scales"]
-            scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0]
-            assert real_shape[1] % scales_width == 0
-            groupsize = real_shape[1] // scales_width
-            have_g_idx = f"{namebase}.g_idx" in model
-            data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx)
-
-            def load() -> Tensor:
-                return GPTQForLLaMaQuantizedTensor(model, namebase)
-
-            return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]'))
-        else:
-            return (name, model[name])
-    return dict(convert(name) for name in model)
 
 # Functionality that simulates `torch.load` but where individual tensors are
 # only loaded into memory on demand, not all at once.
@@ -824,13 +657,11 @@ class LazyUnpickler(pickle.Unpickler):
         assert isinstance(pid[1], LazyStorageKind)
         data_type = pid[1].data_type
         filename_stem = pid[2]
-        filename = self.data_base_path + '/' + filename_stem
+        filename = f'{self.data_base_path}/{filename_stem}'
         info = self.zip_file.getinfo(filename)
 
         def load(offset: int, elm_count: int) -> NDArray:
-            dtype = DATA_TYPE_TO_NUMPY.get(data_type)
-            if dtype is None:
-                raise Exception("tensor stored in unsupported format")
+            dtype = data_type.dtype
             fp = self.zip_file.open(info)
             fp.seek(offset * dtype.itemsize)
             size = elm_count * dtype.itemsize
@@ -840,9 +671,8 @@ class LazyUnpickler(pickle.Unpickler):
         description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
         return LazyStorage(load=load, kind=pid[1], description=description)
 
-    # @staticmethod
+    @staticmethod
     def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
-                               # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)
 
@@ -852,13 +682,15 @@ class LazyUnpickler(pickle.Unpickler):
         description = f'pickled storage_offset={storage_offset} in {storage.description}'
         return LazyTensor(load, list(size), storage.kind.data_type, description)
 
-    # @staticmethod
+    @staticmethod
     def rebuild_from_type_v2(func, new_type, args, state):
        return func(*args)
 
-    CLASSES: Dict[Any, Any] = {
-        ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
-        ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
+    CLASSES: dict[tuple[str, str], Any] = {
+        # getattr used here as a workaround for mypy not being smart enough to determine
+        # the staticmethods have a __func__ attribute.
+ ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), + ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16), ('torch', 'HalfStorage'): LazyStorageKind(DT_F16), ('torch', 'FloatStorage'): LazyStorageKind(DT_F32), @@ -885,25 +717,17 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None) -SAFETENSORS_DATA_TYPES: Dict[str, DataType] = { - 'BF16': DT_BF16, - 'F16': DT_F16, - 'F32': DT_F32, - 'I32': DT_I32, -} - - def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: header_size, = struct.unpack(' LazyTensor: + def convert(info: dict[str, Any]) -> LazyTensor: data_type = SAFETENSORS_DATA_TYPES[info['dtype']] - numpy_dtype = DATA_TYPE_TO_NUMPY[data_type] - shape: List[int] = info['shape'] + numpy_dtype = data_type.dtype + shape: list[int] = info['shape'] begin, end = info['data_offsets'] assert 0 <= begin <= end <= len(byte_buf) assert end - begin == math.prod(shape) * numpy_dtype.itemsize @@ -924,84 +748,6 @@ def must_read(fp: IO[bytes], length: int) -> bytes: return ret -def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus: - magic = must_read(fp, 4)[::-1] - if magic in (b'ggmf', b'ggjt'): - version, = struct.unpack("i", must_read(fp, 4)) - assert version == 1 - else: - assert magic == b'ggml' - version = None - n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack('<7i', must_read(fp, 28)) - - tokens: List[Tuple[bytes, float]] = [] - for i in range(n_vocab): - if i == 32000: - # HACK: GPT4All messed with the format without changing the magic - # number. Specifically, they changed the vocab section to contain - # `n_vocab - 1` tokens instead of `n_vocab` (i.e. omitting the - # extra pad token). Try to detect if we're reading a file like - # this. - orig_pos = fp.tell() - fp.seek(20, io.SEEK_CUR) - is_gpt4all = fp.read(21) == b'tok_embeddings.weight' - fp.seek(orig_pos) - if is_gpt4all: - break - - length, = struct.unpack("i", must_read(fp, 4)) - text = must_read(fp, length) - if magic != b'ggml': - score, = struct.unpack("f", must_read(fp, 4)) - tokens.append((text, score)) - vocab = GGMLVocab(tokens) if magic != b'ggml' else None - - model: LazyModel = {} - # Use mmap for the actual data to avoid race conditions with the file offset. 
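
The safetensors layout that `lazy_load_safetensors_file()` parses is simple: an 8-byte little-endian header size, a JSON header mapping names to dtype/shape/offsets, then the raw tensor buffer. A self-contained sketch with a synthetic in-memory file (not the converter's code):

```python
import io, json, struct
import numpy as np

# Build a tiny synthetic safetensors-style blob with one F32 tensor 't'.
header = {'t': {'dtype': 'F32', 'shape': [2], 'data_offsets': [0, 8]}}
hjson = json.dumps(header).encode('utf-8')
blob = struct.pack('<Q', len(hjson)) + hjson + np.float32([1.5, -2.0]).tobytes()

fp = io.BytesIO(blob)
header_size, = struct.unpack('<Q', fp.read(8))
meta = json.loads(fp.read(header_size))
byte_buf = fp.read()
begin, end = meta['t']['data_offsets']
print(np.frombuffer(byte_buf[begin:end], dtype=np.float32))   # [ 1.5 -2. ]
```
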
- off = fp.raw.tell() - mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)) - fp.raw.seek(off) # needed on Windows - - def read_tensor() -> None: # this is a function so that variables captured in `load` don't change - shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12)) - assert 0 <= shape_len <= 3 - shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len))) - shape = shape[::-1] - name = must_read(fp, name_len).decode('utf-8') - data_type = FTYPE_TO_DATA_TYPE[ftype] - - if magic == b'ggjt': - fp.seek((fp.tell() + 31) & -32) - - if data_type == DT_Q4_1: - # See GPTQForLLaMaQuantizedTensor.ggml_ndarray() - size = 24 * (shape[1] // 32) * shape[0] - elif data_type == DT_Q4_0: - size = 20 * (shape[1] // 32) * shape[0] - else: - numpy_dtype = DATA_TYPE_TO_NUMPY[data_type] - elm_count = math.prod(shape) - size = elm_count * numpy_dtype.itemsize - offset = fp.tell() - buf = mapped[offset:offset+size] - fp.seek(size, io.SEEK_CUR) - - def load() -> Tensor: - if isinstance(data_type, QuantizedDataType): - ndarray = np.frombuffer(buf, dtype=np.uint32) - return GGMLQuantizedTensor(ndarray, shape, data_type) - else: - return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) - description = f'ggml offset={offset} type={data_type} path={path}' - model[name] = LazyTensor(load, shape, data_type, description) - - while fp.read(1) != b'': - fp.seek(-1, io.SEEK_CUR) - read_tensor() - - return ModelPlus(model=model, paths=[path], format='ggml', vocab=vocab) - - @functools.lru_cache(maxsize=None) def lazy_load_file(path: Path) -> ModelPlus: fp = open(path, 'rb') @@ -1010,9 +756,6 @@ def lazy_load_file(path: Path) -> ModelPlus: if first8[:2] == b'PK': # A zip file, i.e. PyTorch format return lazy_load_torch_file(fp, path) - elif first8[2:4] == b'gg': - # GGML format - return lazy_load_ggml_file(fp, path) elif struct.unpack(' ModelPlus: In = TypeVar('In') Out = TypeVar('Out') - -def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]: +def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]: '''Parallel map, but with backpressure. If the caller doesn't call `next` fast enough, this will stop calling `func` at some point rather than letting results pile up in memory. Specifically, there is a max of one output value buffered per thread.''' - with concurrent.futures.ThreadPoolExecutor() as executor: - futures: List[concurrent.futures.Future[Out]] = [] - items_rev = list(iterable)[::-1] - for i in range(min(concurrency, len(items_rev))): - futures.append(executor.submit(func, items_rev.pop())) + if concurrency < 2: + yield from map(func, iterable) + # Not reached. 
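
The backpressure idea behind `bounded_parallel_map` is to keep at most `concurrency` futures in flight and submit a new item only after yielding a finished result, so outputs never pile up in memory. A simplified model of the same technique (not the converter's code):

```python
from concurrent.futures import ThreadPoolExecutor, Future

def bounded_map(func, items, concurrency: int = 2):
    it = iter(items)
    with ThreadPoolExecutor(max_workers=concurrency) as ex:
        # Prime at most `concurrency` tasks.
        futures: list[Future] = [ex.submit(func, x) for _, x in zip(range(concurrency), it)]
        while futures:
            yield futures.pop(0).result()   # block on the oldest result, in order
            for x in it:                    # then refill exactly one slot
                futures.append(ex.submit(func, x))
                break

print(list(bounded_map(lambda x: x * x, range(5))))   # [0, 1, 4, 9, 16]
```
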
+ iterable = iter(iterable) + executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor] + if use_processpool_executor: + executor_class = ProcessPoolExecutor + else: + executor_class = ThreadPoolExecutor + with executor_class(max_workers = max_workers) as executor: + futures: list[concurrent.futures.Future[Out]] = [] + done = False + for _ in range(concurrency): + try: + futures.append(executor.submit(func, next(iterable))) + except StopIteration: + done = True + break + while futures: result = futures.pop(0).result() - if items_rev: - futures.append(executor.submit(func, items_rev.pop())) + while not done and len(futures) < concurrency: + try: + futures.append(executor.submit(func, next(iterable))) + except StopIteration: + done = True + break yield result - def check_vocab_size(params: Params, vocab: Vocab) -> None: if params.n_vocab != vocab.vocab_size: - # GGMLVocab comes from the same file as the model so shouldn't mismatch: - assert isinstance(vocab, SentencePieceVocab) + assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab) if params.n_vocab == vocab.vocab_size_base: print("Ignoring added_tokens.json since model matches vocab size without it.") vocab.added_tokens_list = [] @@ -1061,105 +819,200 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None: class OutputFile: def __init__(self, fname_out: Path) -> None: - self.fout = open(fname_out, "wb") - - def write_file_header(self, params: Params, file_type: GGMLFileType) -> None: - self.fout.write(b"ggjt"[::-1]) # magic - values = [ - 1, # file version - params.n_vocab, - params.n_embd, - params.n_mult, - params.n_head, - params.n_layer, - params.n_embd // params.n_head, # rot (obsolete) - file_type.value, - ] - self.fout.write(struct.pack("i" * len(values), *values)) - - def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None: - sname = name.encode('utf-8') - self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type])) - self.fout.write(struct.pack("i" * len(shape), *shape[::-1])) - self.fout.write(sname) - self.fout.seek((self.fout.tell() + 31) & -32) - - def write_vocab(self, vocab: Vocab) -> None: - for text, score in vocab.all_tokens(): - self.fout.write(struct.pack("i", len(text))) - self.fout.write(text) - self.fout.write(struct.pack("f", score)) + self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + + def add_meta_arch(self, params: Params) -> None: + name = "LLaMA" + + # TODO: better logic to determine model name + if params.n_ctx == 4096: + name = "LLaMA v2" + elif params.path_model is not None: + name = str(params.path_model.parent).split('/')[-1] + + self.gguf.add_name (name) + self.gguf.add_context_length (params.n_ctx) + self.gguf.add_embedding_length (params.n_embd) + self.gguf.add_block_count (params.n_layer) + self.gguf.add_feed_forward_length (params.n_ff) + self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) + self.gguf.add_head_count (params.n_head) + self.gguf.add_head_count_kv (params.n_head_kv) + self.gguf.add_layer_norm_rms_eps (params.f_norm_eps) + + if params.f_rope_freq_base is not None: + self.gguf.add_rope_freq_base(params.f_rope_freq_base) + + if params.f_rope_scale is not None: + self.gguf.add_rope_scale_linear(params.f_rope_scale) + + if params.ftype is not None: + self.gguf.add_file_type(params.ftype) + + def add_meta_vocab(self, vocab: Vocab) -> None: + tokens = [] + scores = [] + toktypes = [] + # NOTE: `all_tokens` returns the base vocabulary and added tokens + for 
text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + if isinstance(vocab, SentencePieceVocab): + self.gguf.add_tokenizer_model("llama") + elif isinstance(vocab, BpeVocab): + self.gguf.add_tokenizer_model("gpt2") + else: + raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab') + self.gguf.add_token_list(tokens) + self.gguf.add_token_scores(scores) + self.gguf.add_token_types(toktypes) + + def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None: + svocab.add_to_gguf(self.gguf) + + def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: + n_elements = int(np.prod(tensor.shape)) + raw_dtype = getattr(tensor.data_type, 'ggml_type', None) + data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype + data_nbytes = tensor.data_type.elements_to_bytes(n_elements) + self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype) + + def write_meta(self) -> None: + self.gguf.write_header_to_file() + self.gguf.write_kv_data_to_file() + + def write_tensor_info(self) -> None: + self.gguf.write_ti_data_to_file() + + def close(self) -> None: + self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: - of = OutputFile(fname_out) - params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0) + def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None: + check_vocab_size(params, vocab) + of = OutputFile(fname_out) - of.write_file_header(params, file_type=GGMLFileType.AllF32) - of.write_vocab(vocab) - of.fout.close() + + # meta data + of.add_meta_arch(params) + of.add_meta_vocab(vocab) + of.add_meta_special_vocab(svocab) + + of.write_meta() + + of.close() + + @staticmethod + def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]: + name, lazy_tensor = item + tensor = lazy_tensor.load().to_ggml() + return (lazy_tensor.data_type, tensor.ndarray) @staticmethod - def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None: + def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: + dt, arr = item + if not isinstance(dt, QuantizedDataType): + return arr + return dt.quantize(arr) + + @staticmethod + def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None: check_vocab_size(params, vocab) + of = OutputFile(fname_out) - of.write_file_header(params, file_type) - print("Writing vocab...") - of.write_vocab(vocab) - def do_item(item: Tuple[str, LazyTensor]) -> NDArray: - name, lazy_tensor = item - return lazy_tensor.load().to_ggml().ndarray + # meta data + of.add_meta_arch(params) + of.add_meta_vocab(vocab) + of.add_meta_special_vocab(svocab) + + # tensor info + for name, lazy_tensor in model.items(): + of.add_tensor_info(name, lazy_tensor) + + of.write_meta() + of.write_tensor_info() + + # tensor data + ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) + if ftype == GGMLFileType.MostlyQ8_0: + ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True) + else: + ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) - ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8) + start = 
time.time()
     for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+        elapsed = time.time() - start
         size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
         padi = len(str(len(model)))
-        print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
-        of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
-        ndarray.tofile(of.fout)
-    of.fout.close()
+        print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
+        of.gguf.write_tensor_data(ndarray)
+    of.close()
 
-def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
-    wq_type = model["layers.0.attention.wq.weight"].data_type
-    if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
+def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
+    wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
+
+    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
         return GGMLFileType.AllF32
-    if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
+    if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
         return GGMLFileType.MostlyF16
-    if output_type_str == "q4_1" or (output_type_str is None and isinstance(wq_type, QuantizedDataType) and
-                                     wq_type.have_addends):
-        if isinstance(model["output.weight"].data_type, QuantizedDataType):
-            return GGMLFileType.MostlyQ4_1
-        else:
-            return GGMLFileType.PerLayerIsQ4_1
-    if output_type_str == "q4_0" or (output_type_str is None and isinstance(wq_type, QuantizedDataType)):
-        return GGMLFileType.MostlyQ4_0
+    if output_type_str == "q8_0":
+        return GGMLFileType.MostlyQ8_0
+
     name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
+
     raise Exception(f"Unexpected combination of types: {name_to_type}")
 
+def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
+    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
+            for (name, tensor) in model.items()}
 
-def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
-    model = handle_quantization(model)
+def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
+    tmap = gguf.TensorNameMap(ARCH, params.n_layer)
+    should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
 
-    if "lm_head.weight" in model:
-        model = convert_transformers_to_orig(model, params)
-    model = filter_and_sort_tensors(model)
+    tmp = model
 
-    return model
+    # HF models permute or pack some of the tensors, so we need to undo that
+    for i in itertools.count():
+        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
+            print(f"Permuting layer {i}")
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
+            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
+            #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
+        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
+            print(f"Unpacking and permuting layer {i}")
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
+
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) + tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) + del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] + else: + break + out: LazyModel = {} + for name, lazy_tensor in model.items(): + tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) + if name_new is None: + raise Exception(f"Unexpected tensor name: {name}") -def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: - return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) - for (name, tensor) in model.items()} + if tensor_type in should_skip: + print(f"skipping tensor {name_new}") + continue + print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") + out[name_new] = lazy_tensor -def nth_multifile_path(path: Path, n: int) -> Optional[Path]: + return out + +def nth_multifile_path(path: Path, n: int) -> Path | None: '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return the nth path in the model. ''' # Support the following patterns: - patterns: List[Tuple[str, str]] = [ + patterns: list[tuple[str, str]] = [ # - x.00.pth, x.01.pth, etc. (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'), # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. @@ -1175,11 +1028,11 @@ def nth_multifile_path(path: Path, n: int) -> Optional[Path]: return None -def find_multifile_paths(path: Path) -> List[Path]: +def find_multifile_paths(path: Path) -> list[Path]: '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return the whole list of paths in the model. ''' - ret: List[Path] = [] + ret: list[Path] = [] for i in itertools.count(): nth_path = nth_multifile_path(path, i) if nth_path is None: @@ -1203,11 +1056,6 @@ def load_some_model(path: Path) -> ModelPlus: # Try the PyTorch patterns too, with lower priority globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] files = [file for glob in globs for file in path.glob(glob)] - if not files: - # Try GGML too, but with lower priority, since if both a non-GGML - # model and a GGML model exist in the same directory, we assume the - # latter was converted from the former. - files = list(path.glob("ggml-model*.bin*")) if not files: raise Exception(f"Can't find model in directory {path}") if len(files) > 1: @@ -1215,7 +1063,7 @@ def load_some_model(path: Path) -> ModelPlus: path = files[0] paths = find_multifile_paths(path) - models_plus: List[ModelPlus] = [] + models_plus: list[ModelPlus] = [] for path in paths: print(f"Loading model file {path}") models_plus.append(lazy_load_file(path)) @@ -1224,19 +1072,14 @@ def load_some_model(path: Path) -> ModelPlus: return model_plus -def filter_and_sort_tensors(model: LazyModel) -> LazyModel: - return {name: model[name] for name in TENSORS_LIST if name in model} - - -def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab: - print(f"vocabtype: {vocabtype}") +def load_vocab(path: Path, vocabtype: str | None) -> Vocab: # Be extra-friendly and accept either a file or a directory. Also, if it's # a directory, it might be the model directory, and tokenizer.model might # be in the parent of that. 
     if path.is_dir():
         vocab_file = "tokenizer.model"
         if vocabtype == 'bpe':
-          vocab_file = "vocab.json"
+            vocab_file = "vocab.json"
         path2 = path / vocab_file
         # Use `.parent` instead of /.. to handle the symlink case better.
         path3 = path.parent / vocab_file
@@ -1246,23 +1089,27 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
             path = path3
         else:
             raise FileNotFoundError(
-                f"Could not find tokenizer.model in {path} or its parent; "
+                f"Could not find {vocab_file} in {path} or its parent; "
                 "if it's in another directory, pass the directory as --vocab-dir")
+
+    print(f"Loading vocab file '{path}', type '{vocabtype}'")
+
     added_tokens_path = path.parent / "added_tokens.json"
-    print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
-                              vocabtype)
+    if vocabtype == "bpe":
+        return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    elif vocabtype == "spm":
+        return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    else:
+        raise ValueError(f"Unsupported vocabulary type {vocabtype}")


-def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
     namestr = {
-        GGMLFileType.AllF32: "f32",
+        GGMLFileType.AllF32:    "f32",
         GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ4_0: "q4_0",
-        GGMLFileType.MostlyQ4_1: "q4_1",
-        GGMLFileType.PerLayerIsQ4_1: "q4_1",
+        GGMLFileType.MostlyQ8_0:"q8_0",
     }[file_type]
-    ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
+    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
     if ret in model_paths:
" @@ -1279,47 +1126,82 @@ def do_dump_model(model_plus: ModelPlus) -> None: print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") -def main(args_in: Optional[List[str]] = None) -> None: +def main(args_in: list[str] | None = None) -> None: parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") - parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") - parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)") - parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, - help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") - parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)") + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") + parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") + parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") + parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) args = parser.parse_args(args_in) - vocab: Vocab if args.dump_single: model_plus = lazy_load_file(args.model) do_dump_model(model_plus) - elif args.vocab_only: - vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) + return + + if not args.vocab_only: + model_plus = load_some_model(args.model) + else: + model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) + + if args.dump: + do_dump_model(model_plus) + return + + params = Params.load(model_plus) + if params.n_ctx == -1: + if args.ctx is None: + raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n" + "Please specify one with --ctx:\n" + " - LLaMA v1: --ctx 2048\n" + " - LLaMA v2: --ctx 4096\n") + params.n_ctx = args.ctx + + if args.outtype: + params.ftype = { + "f32": GGMLFileType.AllF32, + "f16": GGMLFileType.MostlyF16, + "q8_0": GGMLFileType.MostlyQ8_0, + }[args.outtype] + + print(f"params = {params}") + + vocab: Vocab + if args.vocab_only: assert args.outfile, "need --outfile if using --vocab-only" + # FIXME: Try to respect 
+        # FIXME: Try to respect vocab_dir somehow?
+        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
+        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
         outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
         print(f"Wrote {outfile}")
+        return
+
+    if model_plus.vocab is not None and args.vocab_dir is None:
+        vocab = model_plus.vocab
     else:
-        model_plus = load_some_model(args.model)
-        if args.dump:
-            do_dump_model(model_plus)
-            return
-        if model_plus.vocab is not None and args.vocab_dir is None:
-            vocab = model_plus.vocab
-        else:
-            vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-            vocab = load_vocab(vocab_dir, args.vocabtype)
-        params = Params.load(model_plus)
-        model = model_plus.model
-        model = do_necessary_conversions(model, params)
-        output_type = pick_output_type(model, args.outtype)
-        model = convert_to_output_type(model, output_type)
-        outfile = args.outfile or default_outfile(model_plus.paths, output_type)
-        OutputFile.write_all(outfile, params, output_type, model, vocab)
-        print(f"Wrote {outfile}")
+        vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
+        vocab = load_vocab(vocab_dir, args.vocabtype)
+    # FIXME: Try to respect vocab_dir somehow?
+    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
+
+    model   = model_plus.model
+    model   = convert_model_names(model, params)
+    ftype   = pick_output_type(model, args.outtype)
+    model   = convert_to_output_type(model, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+
+    params.ftype = ftype
+    print(f"Writing {outfile}, format {ftype}")
+
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
+    print(f"Wrote {outfile}")


 if __name__ == '__main__':
diff --git a/docs/token_generation_performance_tips.md b/docs/token_generation_performance_tips.md
index 69ba6173c0c26b18a5cc6e915f369a090eeefecc..c9acff7d4f18cced9adf5623f99f228c514dc380 100644
--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@@ -3,7 +3,7 @@
 ## Verifying that the model is running on the GPU with cuBLAS
 Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
+./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
 CPU: 7 physical cores
 RAM: 32GB

-Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
+Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGUF)

-Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`

 Result:
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a7b26776ad355e10c32de98c29b497d8e68e0a85..884c4276422ebf44db08737d210aaf599296d9f2 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -6,27 +6,6 @@ find_package(Threads REQUIRED)

 # ...

-# common
-
-set(TARGET common)
-
-add_library(${TARGET} OBJECT
-    common.h
-    common.cpp
-    console.h
-    console.cpp
-    grammar-parser.h
-    grammar-parser.cpp
-    )
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
-target_include_directories(${TARGET} PUBLIC .)
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama)
-
 # examples

 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@@ -42,8 +21,12 @@ else()
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
     add_subdirectory(train-text-from-scratch)
+    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(simple)
+    add_subdirectory(speculative)
     add_subdirectory(embd-input)
+    add_subdirectory(llama-bench)
+    add_subdirectory(beam-search)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index 6fa55b3194676ca0d126842859b96ba846daba07..ed61125eaa4da4fd6340b52235ae968b7fb2e3e9 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -9,12 +9,12 @@
 #endif

 #ifdef LLAMA_DEFAULT_RMS_EPS
-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 #else
-static const float rms_norm_eps = 5e-6f;
+constexpr float rms_norm_eps = 5e-6f;
 #endif

-float frand() {
+static float frand() {
     return (float)rand()/(float)RAND_MAX;
 }

@@ -25,19 +25,21 @@ struct random_normal_distribution {
     float max;
 };

-void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
+static void init_random_normal_distribution(
+    struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
+) {
     rnd->gen = std::mt19937(seed);
     rnd->nd = std::normal_distribution<float>{mean, std};
     rnd->min = min;
     rnd->max = max;
 }

-float frand_normal(struct random_normal_distribution * rnd) {
+static float frand_normal(struct random_normal_distribution * rnd) {
     const float r = rnd->nd(rnd->gen);
     return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
 }

-void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

     if (plan.work_size > 0) {
@@ -48,13 +50,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
     ggml_graph_compute(graph, &plan);
 }

-struct ggml_tensor * randomize_tensor(
-    struct ggml_tensor * tensor,
-    int ndims,
-    const int64_t ne[],
-    float fmin,
-    float fmax) {
-
+static struct ggml_tensor * randomize_tensor(
+    struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
+) {
     switch (ndims) {
         case 1:
             for (int i0 = 0; i0 < ne[0]; i0++) {
@@ -95,11 +93,9 @@ struct ggml_tensor * randomize_tensor(
     return tensor;
 }

-struct ggml_tensor * randomize_tensor_normal(
-    struct ggml_tensor * tensor,
-    int ndims,
-    const int64_t ne[],
-    struct random_normal_distribution * rnd) {
+static struct ggml_tensor * randomize_tensor_normal(
+    struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
+) {
     float scale = 1.0; // xavier
     switch (ndims) {
         case 1:
@@ -159,7 +155,7 @@ struct llama_hparams {
     }
 };

-uint32_t get_n_ff(const struct llama_hparams* hparams) {
+static uint32_t get_n_ff(const struct llama_hparams* hparams) {
    const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
    return n_ff;
 }
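For concreteness, the rounding in `get_n_ff` recovers LLaMA's feed-forward width: it takes two thirds of `4 * n_embd` and rounds up to the next multiple of `n_mult`. A quick Python check of the arithmetic:

```python
def get_n_ff(n_embd: int, n_mult: int) -> int:
    # Same integer arithmetic as the C helper above.
    return ((2 * (4 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult

assert get_n_ff(4096, 256) == 11008  # LLaMA-7B's feed-forward width
```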
@@ -260,7 +256,7 @@ struct llama_model_lora {
     std::vector<llama_layer_lora> layers;
 };

-void init_model(struct llama_model * model) {
+static void init_model(struct llama_model * model) {
     const auto & hparams = model->hparams;

     const uint32_t n_embd = hparams.n_embd;
@@ -297,7 +293,7 @@ void init_model(struct llama_model * model) {
     }
 }

-void init_model_lora(struct llama_model_lora * model) {
+static void init_model_lora(struct llama_model_lora * model) {
     const auto & hparams = model->hparams;

     const uint32_t n_embd = hparams.n_embd;
@@ -340,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) {
     }
 }

-void set_param_model(struct llama_model * model) {
+static void set_param_model(struct llama_model * model) {
     const auto& hparams = model->hparams;

     const uint32_t n_layer = hparams.n_layer;
@@ -366,7 +362,7 @@ void set_param_model(struct llama_model * model) {
     }
 }

-void set_param_model_lora(struct llama_model_lora * model) {
+static void set_param_model_lora(struct llama_model_lora * model) {
     const auto& hparams = model->hparams;

     const uint32_t n_layer = hparams.n_layer;
@@ -397,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
     }
 }

-void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
+static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
     const auto & hparams = model->hparams;

     const uint32_t n_layer = hparams.n_layer;
@@ -426,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
 }

-void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
+static void randomize_model_lora(
+    struct llama_model_lora * model, int seed, float mean, float std, float min, float max
+) {
     const auto & hparams = model->hparams;

     const uint32_t n_layer = hparams.n_layer;
@@ -459,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
     }
 }

-bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
+static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
     const auto & hparams = model->hparams;

     const uint32_t n_ctx = hparams.n_ctx;
@@ -495,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
     return true;
 }

-bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
+static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
     const auto & hparams = model->hparams;

     const uint32_t n_ctx = hparams.n_ctx;
@@ -531,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
     return true;
 }

-struct ggml_tensor * forward(
-    struct llama_model * model,
-    struct llama_kv_cache * cache,
-    struct ggml_context * ctx0,
-    struct ggml_cgraph * gf,
-    struct ggml_tensor * tokens_input,
-    const int n_tokens,
-    const int n_past) {
-
+static struct ggml_tensor * forward(
+    struct llama_model * model,
+    struct llama_kv_cache * cache,
+    struct ggml_context * ctx0,
+    struct ggml_cgraph * gf,
+    struct ggml_tensor * tokens_input,
+    const int n_tokens,
+    const int n_past
+) {
     const int N = n_tokens;

     struct llama_kv_cache& kv_self = *cache;
@@ -756,25 +754,25 @@ struct ggml_tensor * forward(
     return inpL;
 }

-void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
+static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
     GGML_ASSERT(tensor->n_dims == 1);
     GGML_ASSERT(tensor->ne[0] == ne0);
 }

-void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
+static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
     GGML_ASSERT(tensor->n_dims == 2);
     GGML_ASSERT(tensor->ne[0] == ne0);
     GGML_ASSERT(tensor->ne[1] == ne1);
 }

-void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
+static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
     GGML_ASSERT(tensor->n_dims == 3);
     GGML_ASSERT(tensor->ne[0] == ne0);
     GGML_ASSERT(tensor->ne[1] == ne1);
     GGML_ASSERT(tensor->ne[2] == ne2);
 }

-void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
+static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
     GGML_ASSERT(tensor->n_dims == 4);
     GGML_ASSERT(tensor->ne[0] == ne0);
     GGML_ASSERT(tensor->ne[1] == ne1);
@@ -782,16 +780,16 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
     GGML_ASSERT(tensor->ne[3] == ne3);
 }

-struct ggml_tensor * forward_batch(
-    struct llama_model * model,
-    struct llama_kv_cache * cache,
-    struct ggml_context * ctx0,
-    struct ggml_cgraph * gf,
-    struct ggml_tensor * tokens_input,
-    const int n_tokens,
-    const int n_past,
-    const int n_batch) {
-
+static struct ggml_tensor * forward_batch(
+    struct llama_model * model,
+    struct llama_kv_cache * cache,
+    struct ggml_context * ctx0,
+    struct ggml_cgraph * gf,
+    struct ggml_tensor * tokens_input,
+    const int n_tokens,
+    const int n_past,
+    const int n_batch
+) {
     const int N = n_tokens;

     struct llama_kv_cache& kv_self = *cache;
@@ -1073,16 +1071,15 @@ struct ggml_tensor * forward_batch(
     return inpL;
 }

-
-struct ggml_tensor * forward_lora(
-    struct llama_model_lora * model,
-    struct llama_kv_cache * cache,
-    struct ggml_context * ctx0,
-    struct ggml_cgraph * gf,
-    struct ggml_tensor * tokens_input,
-    const int n_tokens,
-    const int n_past) {
-
+static struct ggml_tensor * forward_lora(
+    struct llama_model_lora * model,
+    struct llama_kv_cache * cache,
+    struct ggml_context * ctx0,
+    struct ggml_cgraph * gf,
+    struct ggml_tensor * tokens_input,
+    const int n_tokens,
+    const int n_past
+) {
     const int N = n_tokens;

     struct llama_kv_cache& kv_self = *cache;
@@ -1328,7 +1325,7 @@ struct ggml_tensor * forward_lora(
     return inpL;
 }

-void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
+static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
     assert(logits->n_dims == 2);
     assert(probs->n_dims == 2);
     assert(best_samples->n_dims == 1);
@@ -1359,7 +1356,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
     }
 }

-void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
+static void sample_softmax_batch(
+    struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
+    struct ggml_tensor * best_samples
+) {
     GGML_ASSERT(best_samples->n_dims == 2);
     GGML_ASSERT(logits->n_dims == 3);
     GGML_ASSERT(probs->n_dims == 3);
@@ -1393,7 +1393,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
     }
 }
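`sample_softmax` above picks, for each row of logits, the arg-max token and fills `probs` with a softmax over the vocabulary. A minimal NumPy sketch of that per-row computation (illustrative only, not the ggml code path):

```python
import numpy as np

def sample_softmax_row(logits: np.ndarray) -> tuple[int, np.ndarray]:
    # Numerically stable softmax plus the greedy (arg-max) sample.
    z = logits - logits.max()
    probs = np.exp(z) / np.exp(z).sum()
    return int(np.argmax(logits)), probs
```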
-void print_row(struct ggml_tensor * probs, int i) {
+static void print_row(struct ggml_tensor * probs, int i) {
     for (int k = 0; k < probs->ne[0]; ++k) {
         float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
         printf(" %.2f", p);
@@ -1401,7 +1401,7 @@ void print_row(struct ggml_tensor * probs, int i) {
     printf("\n");
 }

-void print_matrix(struct ggml_tensor * probs) {
+static void print_matrix(struct ggml_tensor * probs) {
     assert(probs->n_dims == 2);
     for (int i = 0; i < probs->ne[1]; ++i) {
         for (int k = 0; k < probs->ne[0]; ++k) {
@@ -1412,7 +1412,7 @@ void print_matrix(struct ggml_tensor * probs) {
     }
 }

-void print_token(int token, int n_vocab) {
+static void print_token(int token, int n_vocab) {
     for (int k = 0; k < token; ++k) {
         printf(" ");
     }
@@ -1423,14 +1423,14 @@ void print_token(int token, int n_vocab) {
     printf("\n");
 }

-void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
+static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
     for (int i=0; i<tokens->ne[0]; ++i) {
         int token = ggml_get_i32_1d(tokens, i);
         print_token(token, n_vocab);
     }
 }

-void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab = targets->ne[0];
     float randomness = 0.0f;
@@ -1451,7 +1451,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
     }
 }

-void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+static void get_example_targets_batch(
+    struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
+) {
     GGML_ASSERT(tokens_input->n_dims == 2);
     GGML_ASSERT(  targets->n_dims == 3);
     int n_tokens = tokens_input->ne[0];
@@ -1474,7 +1476,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
     }
 }

-void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
+static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab = targets->ne[0];
     for (int i=0; i
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include
+#include
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include
+#include
+#endif
+
+// Used for debugging to print out beam tokens.
+struct ostream_beam_view {
+    llama_context * ctx;
+    llama_beam_view beam_view;
+};
+
+static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
+    os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
+    for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
+        os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
+    }
+    return os << ')';
+}
+
+// Put here anything you want back in beam_search_callback().
+struct beam_search_callback_data {
+    llama_context * ctx;
+    std::vector<llama_token> response;
+};
+
+// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
+// For example, eob can be flagged due to maximum token length, stop words, etc.
+static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
+    return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
+}
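The callback below relies on `beams_state.common_prefix_length`: once every beam agrees on its leading tokens, those tokens are final and can be flushed into the response. A small Python sketch of that idea (the real bookkeeping is done by llama.cpp's beam-search driver):

```python
def common_prefix_length(beams: list[list[int]]) -> int:
    # Count leading token positions on which every beam agrees.
    n = 0
    for column in zip(*beams):
        if any(tok != column[0] for tok in column):
            break
        n += 1
    return n

assert common_prefix_length([[1, 5, 9, 2], [1, 5, 7], [1, 5, 9]]) == 2
```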
+
+// Function matching type llama_beam_search_callback_fn_t.
+// Custom callback example is called each time the beam lengths increase:
+//  * Show progress by printing ',' followed by the number of convergent beam tokens if any.
+//  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
+//    This is also called when the stop condition is met.
+//    Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
+static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
+    auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
+    // Mark beams as EOS as needed.
+    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
+        llama_beam_view& beam_view = beams_state.beam_views[i];
+        if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
+            beam_view.eob = true;
+        }
+    }
+    printf(",");  // Show progress
+    if (const size_t n = beams_state.common_prefix_length) {
+        callback_data.response.resize(callback_data.response.size() + n);
+        assert(0u < beams_state.n_beams);
+        const llama_token * tokens = beams_state.beam_views[0].tokens;
+        std::copy(tokens, tokens + n, callback_data.response.end() - n);
+        printf("%zu", n);
+    }
+    fflush(stdout);
+#if 1 // DEBUG: print current beams for this iteration
+    std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
+    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
+        std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
+    }
+#endif
+}
+
+int main(int argc, char ** argv)
+{
+    gpt_params params;
+
+    if ( argc < 2 )
+    {
+        printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
+        return 1 ;
+    }
+
+    params.model = argv[1];
+
+    params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
+
+    if ( argc > 3 )
+    {
+        params.prompt = argv[3];
+    }
+
+    if ( params.prompt.empty() )
+    {
+        params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
+    }
+
+    //---------------------------------
+    // Init LLM :
+    //---------------------------------
+
+    llama_backend_init(params.numa);
+
+    llama_model * model;
+    llama_context * ctx;
+
+    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+
+    if ( model == NULL )
+    {
+        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+        return 1;
+    }
+
+    //---------------------------------
+    // Tokenize the prompt :
+    //---------------------------------
+
+    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
+
+    const size_t max_context_size     = llama_n_ctx( ctx );
+    const size_t max_tokens_list_size = max_context_size - 4 ;
+
+    if (tokens_list.size() > max_tokens_list_size)
+    {
+        fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
+             __func__ , tokens_list.size() , max_tokens_list_size );
+        return 1;
+    }
+
+    fprintf( stderr, "\n\n" );
+
+    // Print the tokens from the prompt :
+
+    for( auto id : tokens_list )
+    {
+        std::cout << llama_token_to_piece(ctx, id);
+    }
+    std::cout << std::flush;
+
+    int n_past = llama_get_kv_cache_token_count(ctx);
+    if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
+    {
+        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
+        return 1;
+    }
+    n_past += tokens_list.size();
+
+    beam_search_callback_data callback_data{ctx, {}};
+    size_t const beam_width = static_cast<size_t>(params.n_beams);
+    int const n_predict = 256;
+    llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
+
+    std::cout << "\n\n";
+    for (llama_token const token_id : callback_data.response) {
+        std::cout << llama_token_to_piece(ctx,token_id);
+    }
+    std::cout << std::endl;
+
+    llama_free( ctx );
+    llama_free_model( model );
+
+    llama_backend_free();
+
+    return 0;
+}
diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
index 3f3415350919c99058ace62fa61ebc20418e4c6f..14916d83134633ae95c9293070376f3182d827b5 100644
--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -1,7 +1,8 @@
 set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
     add_dependencies(${TARGET} BUILD_INFO)
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index f7215f43bb31ce88c8d16977a8ae30a9a08930aa..f1c382aa9b9557a2636b8ca8d6703cc27c03d362 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,5 +1,6 @@
-#include "ggml.h"
 #include "build-info.h"
+#include "common.h"
+#include "ggml.h"

 #include
 #include
@@ -20,7 +21,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

     if (plan.work_size > 0) {
@@ -31,19 +32,19 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
     ggml_graph_compute(graph, &plan);
 }

-float tensor_sum_elements(const ggml_tensor * tensor) {
-    float sum = 0;
-    if (tensor->type==GGML_TYPE_F32) {
+static float tensor_sum_elements(const ggml_tensor * tensor) {
+    double sum = 0;
+    if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
             for (int k = 0; k < tensor->ne[0]; k++) {
-                sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
+                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
             }
         }
     }
     return sum;
 }

-void tensor_dump(const ggml_tensor * tensor, const char * name) {
+static void tensor_dump(const ggml_tensor * tensor, const char * name) {
     printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
         tensor->type, ggml_type_name(tensor->type),
         tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
@@ -58,7 +59,7 @@ struct benchmark_params_struct {
     int32_t n_iterations = 10;
 };

-void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
+static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
@@ -99,7 +100,7 @@ int main(int argc, char ** argv) {
             exit(1);
         }

-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    print_build_info();
     printf("Starting Test\n");

     // create the ggml context
@@ -125,12 +126,15 @@ int main(int argc, char ** argv) {

     //printf("Memsize required = %i\n", sizex*sizex);

+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
     size_t ctx_size = 0;
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += 1024*1024*16;
@@ -163,7 +167,7 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
     ggml_set_f32(m2, 2.0f);

-    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");

     // printf("Creating new tensor m11xm2\n");
     struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
@@ -181,17 +185,16 @@ int main(int argc, char ** argv) {

     TENSOR_DUMP(gf.nodes[0]);

-    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));

     int32_t nelements = sizex*sizey;
-    int32_t ne[2] = { sizex, sizey };

     std::vector<int64_t> hist_cur(1 << 4, 0);

     // Set up the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());

     // Set up the compute graph
     // printf("Creating new tensor q31\n");
@@ -202,8 +205,8 @@ int main(int argc, char ** argv) {

     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());

     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@@ -220,7 +223,7 @@ int main(int argc, char ** argv) {
     printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);

-    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    // Let's use the F32 result from above as a reference for the quantized multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);

     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
@@ -250,7 +253,7 @@ int main(int argc, char ** argv) {
             // Check that the matrix multiplication result is in the right ballpark
             // We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
             float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
-            float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
+            float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
             float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6

             if (delta > allowed_delta)  {
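The acceptance test at the end of this benchmark is deliberately loose: the quantized product may differ from the F32 reference by at most one part in a million of the reference sum. In Python terms:

```python
def within_tolerance(sum_q: float, sum_ref: float) -> bool:
    # Mirrors the delta/allowed_delta check above (epsilon of 10^-6).
    return abs(sum_q - sum_ref) <= sum_ref / 1_000_000

assert within_tolerance(1_000_000.4, 1_000_000.0)
assert not within_tolerance(1_000_002.0, 1_000_000.0)
```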
# # "--keep 48" is based on the contents of prompts/chat-with-bob.txt # -./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \ +./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \ --repeat_penalty 1.0 --color -i \ -r "User:" -f prompts/chat-with-bob.txt diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e262d44f9849609a9b617ccad01dacc1c53c8d66 --- /dev/null +++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET convert-llama2c-to-ggml) +add_executable(${TARGET} convert-llama2c-to-ggml.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0f37d295bd9ee440fe7135f725d260084c3a34b0 --- /dev/null +++ b/examples/convert-llama2c-to-ggml/README.md @@ -0,0 +1,26 @@ +## Convert llama2.c model to ggml + +This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default. + +To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository: + +`$ make -j` + +After successful compilation, following usage options are available: +``` +usage: ./convert-llama2c-to-ggml [options] + +options: + -h, --help show this help message and exit + --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf') + --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model + --llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin') +``` + +An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows: + +`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` + +Now you can use the model with a command like: + +`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c291f0adf20e18bbb09232b0388dd16f1574ce90 --- /dev/null +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -0,0 +1,963 @@ +#include "ggml.h" +#include "llama.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// GGUF keys & tensor names. 
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c291f0adf20e18bbb09232b0388dd16f1574ce90
--- /dev/null
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -0,0 +1,963 @@
+#include "ggml.h"
+#include "llama.h"
+#include "common.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// GGUF keys & tensor names.
+
+#define KV_GENERAL_ARCHITECTURE          "general.architecture"
+#define KV_GENERAL_NAME                  "general.name"
+
+#define KV_TOKENIZER_MODEL               "tokenizer.ggml.model"
+#define KV_TOKENIZER_LIST                "tokenizer.ggml.tokens"
+#define KV_TOKENIZER_TOKEN_TYPE          "tokenizer.ggml.token_type"
+#define KV_TOKENIZER_SCORES              "tokenizer.ggml.scores"
+#define KV_TOKENIZER_BOS_ID              "tokenizer.ggml.bos_token_id"
+#define KV_TOKENIZER_EOS_ID              "tokenizer.ggml.eos_token_id"
+#define KV_TOKENIZER_UNK_ID              "tokenizer.ggml.unknown_token_id"
+#define KV_TOKENIZER_SEP_ID              "tokenizer.ggml.seperator_token_id"
+#define KV_TOKENIZER_PAD_ID              "tokenizer.ggml.padding_token_id"
+#define KV_TOKENIZER_HF_JSON             "tokenizer.huggingface.json"
+
+#define KV_CONTEXT_LENGTH                "llama.context_length"
+#define KV_EMBEDDING_LENGTH              "llama.embedding_length"
+#define KV_BLOCK_COUNT                   "llama.block_count"
+#define KV_FEED_FORWARD_LENGTH           "llama.feed_forward_length"
+#define KV_ATTENTION_HEAD_COUNT          "llama.attention.head_count"
+#define KV_ATTENTION_HEAD_COUNT_KV       "llama.attention.head_count_kv"
+#define KV_ATTENTION_LAYERNORM_RMS_EPS   "llama.attention.layer_norm_rms_epsilon"
+#define KV_ROPE_DIMENSION_COUNT          "llama.rope.dimension_count"
+
+#define TN_TOKEN_EMBD  "token_embd.weight"
+#define TN_OUTPUT_NORM "output_norm.weight"
+#define TN_OUTPUT      "output.weight"
+#define TN_ATTN_NORM   "blk.%d.attn_norm.weight"
+#define TN_ATTN_Q      "blk.%d.attn_q.weight"
+#define TN_ATTN_K      "blk.%d.attn_k.weight"
+#define TN_ATTN_V      "blk.%d.attn_v.weight"
+#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
+#define TN_FFN_NORM    "blk.%d.ffn_norm.weight"
+#define TN_FFN_GATE    "blk.%d.ffn_gate.weight"
+#define TN_FFN_DOWN    "blk.%d.ffn_down.weight"
+#define TN_FFN_UP      "blk.%d.ffn_up.weight"
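Taken together, these keys describe one GGUF model record. As a rough sketch, the metadata this converter ends up writing (see `save_as_llama_model` further down) can be pictured as a dictionary like the following; the numeric values are made up for a small llama2.c-style model and are not taken from any real checkpoint:

```python
# Hypothetical values, for illustration only; the converter fills these in
# from the llama2.c Config it reads.
llama_gguf_metadata = {
    "general.architecture":                   "llama",
    "tokenizer.ggml.model":                   "llama",
    "llama.context_length":                   256,
    "llama.embedding_length":                 288,
    "llama.block_count":                      6,
    "llama.feed_forward_length":              768,
    "llama.attention.head_count":             6,
    "llama.rope.dimension_count":             48,   # n_embd / n_head
    "llama.attention.layer_norm_rms_epsilon": 1e-5,
}
```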
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_VERSION_GGJT_V3   3
+
+#define TOKENIZER_NAME "llama"
+#define UNKNOWN_TOKEN_ID 0
+#define BOS_TOKEN_ID 1
+#define EOS_TOKEN_ID 2
+
+//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
+typedef struct {
+    int dim;        // transformer dimension
+    int hidden_dim; // for ffn layers
+    int n_layers;   // number of layers
+    int n_heads;    // number of query heads
+    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
+    int vocab_size; // vocabulary size, usually 256 (byte-level)
+    int seq_len;    // max sequence length
+} Config;
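`Config` mirrors the fixed 28-byte header at the front of a llama2.c checkpoint: seven little-endian int32 fields. A hedged Python sketch of reading it (in llama2.c, a negative `vocab_size` conventionally signals that the output classifier `wcls` is not shared with the token embedding, which is where the `shared_weights` flag below comes from):

```python
import struct

def read_llama2c_config(path: str) -> tuple[dict, bool]:
    # Seven int32 fields, in the same order as the Config struct above.
    with open(path, "rb") as f:
        dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len = \
            struct.unpack("<7i", f.read(28))
    shared_weights = vocab_size > 0
    return dict(dim=dim, hidden_dim=hidden_dim, n_layers=n_layers,
                n_heads=n_heads, n_kv_heads=n_kv_heads,
                vocab_size=abs(vocab_size), seq_len=seq_len), shared_weights
```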
+
+struct TransformerWeights {
+    // token embedding table
+    float* token_embedding_table; // (vocab_size, dim)
+    // weights for rmsnorms
+    float* rms_att_weight; // (layer, dim) rmsnorm weights
+    float* rms_ffn_weight; // (layer, dim)
+    // weights for matmuls
+    float* wq; // (layer, dim, dim)
+    float* wk; // (layer, dim, dim)
+    float* wv; // (layer, dim, dim)
+    float* wo; // (layer, dim, dim)
+    // weights for ffn
+    float* w1; // (layer, hidden_dim, dim)
+    float* w2; // (layer, dim, hidden_dim)
+    float* w3; // (layer, hidden_dim, dim)
+    // final rmsnorm
+    float* rms_final_weight; // (dim,)
+    // freq_cis for RoPE relatively positional embeddings
+    // float* freq_cis_real; // (seq_len, dim/2)
+    // float* freq_cis_imag; // (seq_len, dim/2)
+    // (optional) classifier weights for the logits, on the last layer
+    float* wcls;
+
+    ~TransformerWeights() {
+        delete[] token_embedding_table;
+        delete[] rms_att_weight;
+        delete[] rms_ffn_weight;
+        delete[] wq;
+        delete[] wk;
+        delete[] wv;
+        delete[] wo;
+        delete[] w1;
+        delete[] w2;
+        delete[] w3;
+        delete[] rms_final_weight;
+        delete[] wcls;
+    }
+};
+
+static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
+    // value-initialized new[] zeroes the arrays (like calloc) to keep valgrind happy
+    w->token_embedding_table = new float[p->vocab_size * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+
+    w->rms_att_weight = new float[p->n_layers * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+
+    w->rms_ffn_weight = new float[p->n_layers * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+
+    w->wq = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->wk = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->wv = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->wo = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+
+    w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+
+    w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+
+    w->rms_final_weight = new float[p->dim]();
+    printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+
+    if (shared_weights) {
+        w->wcls = NULL;
+    } else {
+        w->wcls = new float[p->vocab_size * p->dim]();
+        printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+    }
+}
+
+static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
+    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
+    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+
+    // Skip freq_cis_real & freq_cis_imag
+    int head_size = p->dim / p->n_heads;
+    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
+
+    if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+
+    // Check we didn't forget to read anything
+    auto curr = ftell(f);
+    fseek(f, 0, SEEK_END);
+    auto end = ftell(f);
+    if (curr != end) {
+        printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
+        return 1;
+    }
+
+    return 0;
+}
+
+static void print_sample_weights(TransformerWeights *w){
+    printf("----- Quick print of first of the weight values of all the variables\n");
+    printf("%f\n", w->token_embedding_table[0]);
+    printf("%f\n", w->rms_att_weight[0]);
+    printf("%f\n", w->rms_ffn_weight[0]);
+
+    printf("%f\n", w->wq[0]);
+    printf("%f\n", w->wk[0]);
+    printf("%f\n", w->wv[0]);
+    printf("%f\n", w->wo[0]);
+    printf("%f\n", w->w1[0]);
+    printf("%f\n", w->w2[0]);
+    printf("%f\n", w->w3[0]);
+    printf("%f\n", w->rms_att_weight[0]);
+    if (w->wcls) printf("%f\n", w->wcls[0]);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
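One step in `checkpoint_init_weights` deserves a note: the single `fseek` skips both RoPE tables at once, because `freq_cis_real` and `freq_cis_imag` each hold `seq_len * head_size/2` floats, i.e. `seq_len * head_size` floats combined:

```python
def freq_cis_skip_bytes(seq_len: int, dim: int, n_heads: int) -> int:
    # Two tables of seq_len * (head_size/2) float32 values each.
    head_size = dim // n_heads
    return seq_len * head_size * 4  # 4 == sizeof(float)

assert freq_cis_skip_bytes(seq_len=256, dim=288, n_heads=6) == 256 * 48 * 4
```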
+//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
+
+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+    using ttype = llama_token_type;
+
+    struct token_data {
+        token text;
+        float score;
+        ttype type;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_data> id_to_token;
+};
+
+struct my_llama_hparams {
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx   = 512;   // this is provided as user input?
+    uint32_t n_embd  = 4096;
+    uint32_t n_ff    = 11008;
+    uint32_t n_mult  = 4;
+    uint32_t n_head  = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot   = 64;
+    bool operator!=(const my_llama_hparams& other) const {
+        return memcmp(this, &other, sizeof(my_llama_hparams));
+    }
+};
+
+struct my_llama_layer {
+    // normalization
+    struct ggml_tensor * attention_norm;
+
+    // attention
+    struct ggml_tensor * wq;
+    struct ggml_tensor * wk;
+    struct ggml_tensor * wv;
+    struct ggml_tensor * wo;
+
+    // normalization
+    struct ggml_tensor * ffn_norm;
+
+    // ff
+    struct ggml_tensor * w1;
+    struct ggml_tensor * w2;
+    struct ggml_tensor * w3;
+};
+
+struct my_llama_model {
+    struct ggml_context * ctx = NULL;
+
+    std::string name;
+
+    my_llama_hparams hparams;
+
+    struct ggml_tensor * tok_embeddings;
+
+    struct ggml_tensor * norm;
+    struct ggml_tensor * output;
+
+    std::vector<my_llama_layer> layers;
+
+    uint32_t train_its = 0;
+    uint32_t train_samples = 0;
+    uint32_t train_tokens = 0;
+};
+
+struct train_params {
+    const char * fn_vocab_model;
+    const char * fn_llama2c_model;
+    const char * fn_llama2c_output_model;
+    const char * fn_train_data;
+    const char * fn_checkpoint_in;
+    const char * fn_checkpoint_out;
+    const char * fn_model_out;
+
+    uint32_t seed;
+
+    int n_ctx;
+    int n_embd;
+    int n_mult;
+    int n_head;
+    int n_layer;
+    int n_rotmax;
+
+    int n_threads;
+    int n_batch;
+    int n_examples;
+    int n_predict;
+
+    int print_info_interval;
+    int print_details_interval;
+
+    bool samples_start_after_nl;
+    bool use_adam;
+    bool use_flash;
+    bool use_scratch;
+
+    // only adam
+    int   warmup;
+    int   cos_decay_steps;
+    float cos_decay_restart;
+    float cos_decay_alpha;
+
+    int   lbfgs_n_iter;
+    int   adam_n_iter;
+    float adam_alpha;
+    float adam_decay;
+
+    int mem_model_gb;
+    int mem_compute_gb;
+    int mem_compute0_gb;
+    int mem_compute1_gb;
+};
+
+static void print_params(struct my_llama_hparams * params) {
+    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
+    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
+    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
+    printf("%s: n_head:  %d\n", __func__, params->n_head);
+    printf("%s: n_ff:    %d\n", __func__, params->n_ff);
+    printf("%s: n_layer: %d\n", __func__, params->n_layer);
+    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
+}
+
+static void init_model(struct my_llama_model * model) {
+    const auto & hparams = model->hparams;
+
+    const uint32_t n_embd  = hparams.n_embd;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_vocab = hparams.n_vocab;
+
+    const uint32_t n_ff = hparams.n_ff;
+    struct ggml_context * ctx = model->ctx;
+
+    model->train_its = 0;
+    model->train_samples = 0;
+    model->train_tokens = 0;
+
+    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
+
+    model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
+
+    model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+
+    // printing the per-layer allocations here so we don't print in the for loop.
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+
+    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
+
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+
+    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
+    ggml_set_name(model->norm,           "norm.weight");
+    ggml_set_name(model->output,         "output.weight");
+
+    model->layers.resize(n_layer);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        std::string layers_i = "layers." + std::to_string(i);
+
+        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+
+        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
+        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+
+        ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());
+
+        ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
+        ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
+        ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
+        ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());
+
+        ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
+
+        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
+        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
+        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
+    }
+}
+
+static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+    return *ptr;
+}
+
+static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+    int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+    return *ptr;
+}
+
+static void print_row(struct ggml_tensor * probs, int i) {
+    for (int k = 0; k < probs->ne[0]; ++k) {
+        float p = get_f32_2d(probs, k, i);
+        printf(" %f", p);
+    }
+    printf("\n");
+}
+
+static void print_matrix(struct ggml_tensor * probs) {
+    assert(probs->n_dims == 2);
+    for (int i = 0; i < probs->ne[1]; ++i) {
+        for (int k = 0; k < probs->ne[0]; ++k) {
+            float p = get_f32_2d(probs, k, i);
+            printf(" %.2f", p);
+        }
+        printf("\n");
+    }
+}
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            size = 0;
+        } else {
+            seek(0, SEEK_END);
+            size = tell();
+            seek(0, SEEK_SET);
+        }
+    }
+
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        GGML_ASSERT(ret == 0); // same
+    }
+
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            die_fmt("fread failed: %s", strerror(errno));
+        }
+        if (ret != 1) {
+            die("unexpectedly reached end of file");
+        }
+    }
+
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+    std::float_t read_f32() {
+        std::float_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+static bool is_ggml_file(const char * filename) {
+    llama_file file(filename, "rb");
+    if (file.size < 4) {
+        return false;
+    }
+    uint32_t magic = file.read_u32();
+    return magic == GGUF_MAGIC;
+}
+
+static std::string llama_escape_whitespaces(const std::string & text) {
+    std::ostringstream out;
+    for (char c : text) {
+        if (c == ' ') out << "\xe2\x96\x81";
+        else out << c;
+    }
+    return out.str();
+}
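`llama_escape_whitespaces` rewrites spaces into SentencePiece's visible-space marker, U+2581 (LOWER ONE EIGHTH BLOCK), whose UTF-8 encoding is the `\xe2\x96\x81` byte string used above. The same transform in Python:

```python
def escape_whitespaces(text: str) -> str:
    # SentencePiece represents a space as U+2581; b"\xe2\x96\x81" is its UTF-8 form.
    return text.replace(" ", "\u2581")

assert escape_whitespaces(" hello world").encode("utf-8") == b"\xe2\x96\x81hello\xe2\x96\x81world"
```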
+
+static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
+    if (is_ggml_file(filename)) {
+        struct ggml_context * ctx_data = NULL;
+
+        struct gguf_init_params params = {
+            /*.no_alloc = */ false,
+            /*.ctx      = */ &ctx_data,
+        };
+
+        struct gguf_context * ctx = gguf_init_from_file(filename, params);
+        GGML_ASSERT(ctx != NULL);
+
+        const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
+        GGML_ASSERT(model_idx >= 0);
+        std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
+        GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);
+
+        const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
+        GGML_ASSERT(token_idx >= 0);
+
+        const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
+        GGML_ASSERT(score_idx >= 0);
+        const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+
+        const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
+        GGML_ASSERT(toktype_idx >= 0);
+        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+
+        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
+
+        vocab->id_to_token.resize(n_vocab);
+
+        for (uint32_t i = 0; i < n_vocab; i++) {
+            std::string word = gguf_get_arr_str(ctx, token_idx, i);
+
+            vocab->token_to_id[word] = i;
+
+            auto & token_data = vocab->id_to_token[i];
+            token_data.text = std::move(word);
+            token_data.score = scores[i];
+            token_data.type = (llama_token_type) toktypes[i];
+        }
+        ggml_free(ctx_data);
+        gguf_free(ctx);
+    } else {
+        // assume llama2.c vocabulary
+        printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
+        llama_file file(filename, "rb");
+        if (!file.fp) {
+            die_fmt("%s: %s", strerror(errno), filename);
+        }
+        const int n_vocab = config->vocab_size;
+        /* uint32_t max_token_length = */ file.read_u32(); // unused
+        vocab->id_to_token.resize(n_vocab);
+        for (llama_vocab::id id=0; id<n_vocab; ++id) {
+            float_t score = file.read_f32();
+            uint32_t len = file.read_u32();
+            std::string text = file.read_string(len);
+
+            unsigned char byte_val;
+            llama_token_type type = LLAMA_TOKEN_TYPE_NORMAL;
+            if (id == UNKNOWN_TOKEN_ID) {
+                text = "<unk>";
+                type = LLAMA_TOKEN_TYPE_UNKNOWN;
+            } else if (id == BOS_TOKEN_ID) {
+                text = "<s>";
+                type = LLAMA_TOKEN_TYPE_CONTROL;
+            } else if (id == EOS_TOKEN_ID) {
+                text = "</s>";
+                type = LLAMA_TOKEN_TYPE_CONTROL;
+            } else if (text.empty()) {
+                type = LLAMA_TOKEN_TYPE_CONTROL;
+            } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
+                // Text of byte tokens is already in the expected format.
+                type = LLAMA_TOKEN_TYPE_BYTE;
+            } else {
+                type = LLAMA_TOKEN_TYPE_NORMAL;
+            }
+            text = llama_escape_whitespaces(text);
+
+            vocab->id_to_token[id].text = text;
+            vocab->id_to_token[id].score = score;
+            vocab->id_to_token[id].type = type;
+            vocab->token_to_id.emplace(text, id);
+        }
+    }
+}
+
+static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
+    int ct;
+    switch (gg_weights->n_dims){
+        case 1:
+            ct = 0;
+            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
+                float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
+                *ptr = karpathy_weights[ct];
+                ct++;
+            }
+            break;
+        case 2:
+            ct = 0;
+            for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                    float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
+                    *ptr = karpathy_weights[ct];
+                    ct++;
+                }
+            }
+            break;
+        case 3:
+            ct = 0;
+            for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
+                for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                        float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
+                        *ptr = karpathy_weights[ct];
+                        ct++;
+                    }
+                }
+            }
+            break;
+    }
+}
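`convert_weights_ak_to_gg` copies a flat, row-major llama2.c array into a ggml tensor by walking the tensor's per-dimension byte strides (`nb[0..2]`). A self-contained toy of the 2-D case, assuming nothing beyond the standard library; `ToyTensor` is a hypothetical stand-in for `ggml_tensor`:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Toy stand-in for ggml_tensor: ne[] counts elements per dimension,
// nb[] gives the byte stride for stepping along that dimension.
struct ToyTensor {
    std::vector<unsigned char> data;
    int64_t ne[2];
    size_t  nb[2];
};

int main() {
    ToyTensor t;
    t.ne[0] = 3; t.ne[1] = 2;             // 3 columns, 2 rows
    t.nb[0] = sizeof(float);              // contiguous floats within a row
    t.nb[1] = t.ne[0] * sizeof(float);    // one full row of floats per step
    t.data.resize(t.ne[1] * t.nb[1]);

    const float flat[6] = {0, 1, 2, 3, 4, 5}; // row-major source, like llama2.c weights

    int ct = 0;
    for (int64_t i1 = 0; i1 < t.ne[1]; i1++) {
        for (int64_t i0 = 0; i0 < t.ne[0]; i0++) {
            float * ptr = (float *)(t.data.data() + i0*t.nb[0] + i1*t.nb[1]);
            *ptr = flat[ct++];
        }
    }
    printf("t[2,1] = %.0f\n", *(float *)(t.data.data() + 2*t.nb[0] + 1*t.nb[1])); // 5
    return 0;
}
```

For fully contiguous layouts the loop degenerates to a single memcpy; the stride form also survives padded or permuted tensors.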
+
+static void save_as_llama_model(
+    struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
+) {
+    // convert AK weights into GG weights one by one.
+    // w->token_embedding_table -> model->tok_embeddings
+    // float* -> struct ggml_tensor
+    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
+    convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
+
+    convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
+    //print_row(model->norm, 0);
+
+    // for rms-att-weight
+    int row_length = model->hparams.n_embd;
+    int n_ff = model->hparams.n_ff;
+
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+        auto & layer = model->layers[i];
+        // 1d
+        convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+        convert_weights_ak_to_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+        convert_weights_ak_to_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+        convert_weights_ak_to_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+        convert_weights_ak_to_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+        convert_weights_ak_to_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+
+        convert_weights_ak_to_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+        convert_weights_ak_to_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+        convert_weights_ak_to_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+    }
+
+    struct gguf_context * ctx = gguf_init_empty();
+
+    std::vector<const char*> tokens;
+    std::vector<float> scores;
+    std::vector<llama_token_type> token_types;
+    for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
+        tokens.push_back(token_data.text.c_str());
+        scores.push_back(token_data.score);
+        token_types.push_back(token_data.type);
+    }
+    gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
+    gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
+    gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());
+
+    gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);
+
+    gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
+    gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");
+
+    // special tokens
+    gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
+
+    gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
+    gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
+    gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
+    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
+    // n_head_kv is optional, default to n_head
+    // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
+    gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
+    gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
+    gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+
+    // write tensors
+    ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
+    gguf_add_tensor(ctx, model->tok_embeddings);
+
+    ggml_set_name(model->norm, TN_OUTPUT_NORM);
+    gguf_add_tensor(ctx, model->norm);
+
+    ggml_set_name(model->output, TN_OUTPUT);
+    gguf_add_tensor(ctx, model->output);
+
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        ggml_format_name(layer.wq, TN_ATTN_Q, i);
+        gguf_add_tensor(ctx, layer.wq);
+
+        ggml_format_name(layer.wk, TN_ATTN_K, i);
+        gguf_add_tensor(ctx, layer.wk);
+
+        ggml_format_name(layer.wv, TN_ATTN_V, i);
+        gguf_add_tensor(ctx, layer.wv);
+
+        ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
+        gguf_add_tensor(ctx, layer.wo);
+
+        ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
+        gguf_add_tensor(ctx, layer.attention_norm);
+
+        ggml_format_name(layer.w1, TN_FFN_GATE, i);
+        gguf_add_tensor(ctx, layer.w1);
+
+        ggml_format_name(layer.w2, TN_FFN_DOWN, i);
+        gguf_add_tensor(ctx, layer.w2);
+
+        ggml_format_name(layer.w3, TN_FFN_UP, i);
+        gguf_add_tensor(ctx, layer.w3);
+
+        ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
+        gguf_add_tensor(ctx, layer.ffn_norm);
+    }
+
+    gguf_write_to_file(ctx, filename, false);
+    gguf_free(ctx);
+}
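Everything `save_as_llama_model` emits goes through the same small `gguf_*` surface: create an empty context, set key-value pairs, attach named tensors, write once. Stripped to its skeleton, a minimal sketch against the gguf API used above; the key strings and output file name here are illustrative:

```cpp
#include "ggml.h" // gguf_* API as used by this patch
#include <cstdio>

// Minimal GGUF write: a couple of KV pairs and no tensors.
int main() {
    struct gguf_context * ctx = gguf_init_empty();
    gguf_set_val_str(ctx, "general.architecture", "llama");
    gguf_set_val_u32(ctx, "llama.context_length", 128);
    gguf_write_to_file(ctx, "tiny.gguf", false);
    printf("wrote tiny.gguf\n");
    gguf_free(ctx);
    return 0;
}
```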
+
+static struct train_params get_default_train_params() {
+    struct train_params params;
+    params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
+    params.fn_llama2c_output_model = "ak_llama_model.bin";
+    params.fn_train_data = "shakespeare.txt";
+    params.fn_checkpoint_in = "checkpoint.bin";
+    params.fn_checkpoint_out = "checkpoint.bin";
+    params.fn_model_out = "ggml-checkpoint-f32.bin";
+
+    params.seed = -1;
+
+    params.n_ctx = 128;
+    params.n_embd = 256;
+    params.n_mult = 256;
+    params.n_head = 8;
+    params.n_layer = 16;
+    params.n_rotmax = 64;
+
+    params.n_threads = 6;
+    params.n_batch = 8;
+    params.n_examples = 8;
+    params.n_predict = 1024;
+
+    params.print_info_interval = 1;
+    params.print_details_interval = 2;
+
+    params.samples_start_after_nl = false;
+    params.use_adam = true;
+    params.use_flash = true;
+    params.use_scratch = true;
+
+    // only adam
+    params.warmup = 100;
+    params.cos_decay_steps = 1000;
+    params.cos_decay_restart = 1.1f;
+    params.cos_decay_alpha = 0.0f;
+
+    params.lbfgs_n_iter = 16;
+    params.adam_n_iter = 16;
+    params.adam_alpha = 1e-3f;
+    params.adam_decay = 1e-3f;
+
+    params.mem_model_gb = 2;
+    params.mem_compute_gb = 24;
+    params.mem_compute0_gb = 8;
+    params.mem_compute1_gb = 2;
+
+    return params;
+}
+
+static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help                       show this help message and exit\n");
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
+    fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default '%s')\n", params->fn_llama2c_output_model);
+    fprintf(stderr, "\n");
+}
+
+static bool params_parse(int argc, char ** argv, struct train_params * params) {
+    bool invalid_param = false;
+    bool reqd_param_found = false;
+    std::string arg;
+    struct train_params default_params = get_default_train_params();
+    const std::string arg_prefix = "--";
+
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        if (arg == "--copy-vocab-from-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_vocab_model = argv[i];
+        } else if (arg == "--llama2c-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            reqd_param_found = true;
+            params->fn_llama2c_model = argv[i];
+        } else if (arg == "--llama2c-output-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_llama2c_output_model = argv[i];
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, &default_params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            print_usage(argc, argv, &default_params);
+            exit(1);
+        }
+    }
+    if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + print_usage(argc, argv, &default_params); + exit(1); + } + if (!reqd_param_found){ + fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n"); + print_usage(argc, argv, &default_params); + exit(1); + } + + return true; +} + +static std::string basename(const std::string &path) { + size_t pos = path.find_last_of("/\\"); + if (pos == std::string::npos) { + return path; + } + return path.substr(pos + 1); +} + +int main(int argc, char ** argv) { + struct train_params params = get_default_train_params(); + if (!params_parse(argc, argv, ¶ms)) { + return 1; + } + Config config; + TransformerWeights weights = {}; + { + FILE *file = fopen(params.fn_llama2c_model, "rb"); + if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; } + // read in the config header + if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; } + auto shared_weights = config.vocab_size > 0; + config.vocab_size = abs(config.vocab_size); + + // read in the Transformer weights + malloc_weights(&weights, &config, shared_weights); + if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; } + fclose(file); + } + + struct llama_vocab vocab; + load_vocab(params.fn_vocab_model, &config, &vocab); + + struct my_llama_model model; + model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); + model.hparams.n_ctx = params.n_ctx; + model.hparams.n_embd = config.dim; //params.n_embd; + model.hparams.n_ff = config.hidden_dim; + model.hparams.n_mult = 32;//params.n_mult; + model.hparams.n_head = config.n_heads; //params.n_head; + model.hparams.n_layer = config.n_layers; //params.n_layer; + model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); + print_params(&model.hparams); + struct ggml_init_params lcparams; + lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); + lcparams.mem_buffer = NULL; + lcparams.no_alloc = false; + + model.ctx = ggml_init(lcparams); + + init_model(&model); + model.name = basename(params.fn_llama2c_model); + save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); + + printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model); + + ggml_free(model.ctx); + return 0; +} diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 2185b9b0e2839ff652fc8d26f7057a1f897b7257..c995eef3514a051621e21f67c5da9d6eefc3e321 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -1,8 +1,5 @@ -// Defines sigaction on msys: -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - +#include "build-info.h" +#include "common.h" #include "embd-input.h" #include @@ -23,11 +20,11 @@ extern "C" { struct MyModel* create_mymodel(int argc, char ** argv) { gpt_params params; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { return nullptr; } - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + print_build_info(); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = uint32_t(time(NULL)); @@ -167,7 +164,7 @@ llama_token sampling_id(struct MyModel* mymodel) { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; // TODO: Apply penalties - // float nl_logit = logits[llama_token_nl()]; + // float 
diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp
index 2185b9b0e2839ff652fc8d26f7057a1f897b7257..c995eef3514a051621e21f67c5da9d6eefc3e321 100644
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@@ -1,8 +1,5 @@
-// Defines sigaction on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
+#include "build-info.h"
+#include "common.h"
 #include "embd-input.h"
 
 #include <cassert>
@@ -23,11 +20,11 @@
 extern "C" {
 
 struct MyModel* create_mymodel(int argc, char ** argv) {
     gpt_params params;
-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return nullptr;
     }
-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    print_build_info();
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = uint32_t(time(NULL));
@@ -167,7 +164,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
     // TODO: Apply penalties
-    // float nl_logit = logits[llama_token_nl()];
+    // float nl_logit = logits[llama_token_nl(ctx)];
     // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
     // llama_sample_repetition_penalty(ctx, &candidates_p,
     //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -176,7 +173,7 @@
     //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
     //     last_n_repeat, alpha_frequency, alpha_presence);
     // if (!penalize_nl) {
-    //     logits[llama_token_nl()] = nl_logit;
+    //     logits[llama_token_nl(ctx)] = nl_logit;
     // }
 
     if (temp <= 0) {
@@ -211,10 +208,10 @@ const char * sampling(struct MyModel * mymodel) {
     llama_context * ctx = mymodel->ctx;
     int id = sampling_id(mymodel);
     static std::string ret;
-    if (id == llama_token_eos()) {
+    if (id == llama_token_eos(ctx)) {
         ret = "</s>";
     } else {
-        ret = llama_token_to_str(ctx, id);
+        ret = llama_token_to_piece(ctx, id);
     }
     eval_id(mymodel, id);
     return ret.c_str();
diff --git a/examples/embd-input/embd-input.h b/examples/embd-input/embd-input.h
index efb5ba5e2af4543685c824337b6b8ae18b018572..eff5e3b84e1e1dad8016d32a20312cab9eeeea9a 100644
--- a/examples/embd-input/embd-input.h
+++ b/examples/embd-input/embd-input.h
@@ -3,7 +3,6 @@
 
 #include "common.h"
 #include "llama.h"
-#include "build-info.h"
 
 extern "C" {
diff --git a/examples/embd-input/embd_input.py b/examples/embd-input/embd_input.py
index be2896614e9b37604f580159e7043f1d6b66aa94..f146acdc19de7c65d77959209f35248c8f8d5130 100644
--- a/examples/embd-input/embd_input.py
+++ b/examples/embd-input/embd_input.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import ctypes
 from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
 import numpy as np
diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py
index bcbdd2bedfd1ae6aa1609a9ac1aa29a6c876dd04..06fad55f4980e2af925a208aedd6549e4d8b75ad 100644
--- a/examples/embd-input/llava.py
+++ b/examples/embd-input/llava.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
diff --git a/examples/embd-input/minigpt4.py b/examples/embd-input/minigpt4.py
index 15c9b77c0d37c73316528404360ef3cea72218e5..7b13e4a5cc4f8e4c61a5d48fcdffb3871c487fc2 100644
--- a/examples/embd-input/minigpt4.py
+++ b/examples/embd-input/minigpt4.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
diff --git a/examples/embd-input/panda_gpt.py b/examples/embd-input/panda_gpt.py
index 0cfac5f32adf2669c262751d9a6edcc060ce4fb1..891ad7cc9ffbd9c285c7abe849b73aa9daa73276 100644
--- a/examples/embd-input/panda_gpt.py
+++ b/examples/embd-input/panda_gpt.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
diff --git a/examples/embedding/README.md b/examples/embedding/README.md
index fe8f5dcc62ed9e6e09202a7c6d4464679dff9be5..6929454c5e549ef5848f2c0023bd4b357bfdd8b3 100644
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@@ -1,3 +1,21 @@
-# embedding
+# llama.cpp/example/embedding
 
-TODO
+This example demonstrates how to generate a high-dimensional embedding vector for a given text with llama.cpp.
+
+## Quick Start
+
+To get started right away, run the following command, making sure to use the correct path for the model you have:
+
+### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
+```
+
+### Windows:
+
+```powershell
+embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
+```
+
+The above command will output space-separated float values.
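Since the example prints the embedding as plain space-separated floats on stdout, any consumer can read it back with stream extraction. A minimal sketch that reads one such line from stdin (e.g. `./embedding ... | ./parse`; the consumer program name is illustrative):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Read one line of space-separated floats (the embedding vector).
int main() {
    std::string line;
    std::getline(std::cin, line);
    std::istringstream iss(line);
    std::vector<float> embd;
    for (float v; iss >> v; ) {
        embd.push_back(v);
    }
    std::cout << "read " << embd.size() << " dimensions" << std::endl;
    return 0;
}
```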
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 5192d6df5c2f8f057010eb87ff58049618990c39..27d605f4e13d69f62432ac1be795fded50d6ff81 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,6 +1,6 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"
-#include "build-info.h"
 
 #include <ctime>
@@ -11,18 +11,13 @@
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (gpt_params_parse(argc, argv, params) == false) {
+    if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }
 
     params.embedding = true;
 
-    if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
-    }
-
-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    print_build_info();
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -47,6 +42,12 @@
         return 1;
     }
 
+    const int n_ctx_train = llama_n_ctx_train(ctx);
+    if (params.n_ctx > n_ctx_train) {
+        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                __func__, n_ctx_train, params.n_ctx);
+    }
+
     // print system information
     {
         fprintf(stderr, "\n");
@@ -56,9 +57,6 @@
 
     int n_past = 0;
 
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     // tokenize the prompt
     auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
 
@@ -67,27 +65,34 @@
     fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
     for (int i = 0; i < (int) embd_inp.size(); i++) {
-        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
     }
     fprintf(stderr, "\n");
 }
 
-    if (params.embedding){
-        if (embd_inp.size() > 0) {
-            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return 1;
-            }
+    if (embd_inp.size() > (size_t)params.n_ctx) {
+        fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
+                __func__, embd_inp.size(), params.n_ctx);
+        return 1;
+    }
+
+    while (!embd_inp.empty()) {
+        int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
+        if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return 1;
        }
+        n_past += n_tokens;
+        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
+    }
 
-        const int n_embd = llama_n_embd(ctx);
-        const auto embeddings = llama_get_embeddings(ctx);
+    const int n_embd = llama_n_embd(ctx);
+    const auto embeddings = llama_get_embeddings(ctx);
 
-        for (int i = 0; i < n_embd; i++) {
-            printf("%f ", embeddings[i]);
-        }
-        printf("\n");
+    for (int i = 0; i < n_embd; i++) {
+        printf("%f ", embeddings[i]);
     }
+    printf("\n");
 
     llama_print_timings(ctx);
     llama_free(ctx);
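The new evaluation loop above no longer feeds the whole prompt to a single `llama_eval` call; it consumes the tokens `n_batch` at a time while advancing `n_past`. The same pattern in isolation, as a toy stand-in where the hypothetical `eval_batch` marks the spot `llama_eval` would occupy:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

static void eval_batch(const int * tokens, int n_tokens, int n_past) {
    // llama_eval(ctx, tokens, n_tokens, n_past, n_threads) would go here
    (void) tokens;
    printf("eval %d tokens at n_past = %d\n", n_tokens, n_past);
}

int main() {
    std::vector<int> embd_inp(37, 0); // pretend 37 prompt tokens
    const int n_batch = 8;
    int n_past = 0;
    while (!embd_inp.empty()) {
        const int n_tokens = std::min(n_batch, (int) embd_inp.size());
        eval_batch(embd_inp.data(), n_tokens, n_past);
        n_past += n_tokens;
        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
    }
    return 0;
}
```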
diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7d1806af3ebfcf3f98054020c73ec70b65cf1633
--- /dev/null
+++ b/examples/gguf/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET gguf)
+add_executable(${TARGET} gguf.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9ab63a29310ad99c019d01afb9fe15d15aea3ebe
--- /dev/null
+++ b/examples/gguf/gguf.cpp
@@ -0,0 +1,249 @@
+#include "ggml.h"
+#include "llama.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+template<typename T>
+static std::string to_string(const T & val) {
+    std::stringstream ss;
+    ss << val;
+    return ss.str();
+}
+
+static bool gguf_ex_write(const std::string & fname) {
+    struct gguf_context * ctx = gguf_init_empty();
+
+    gguf_set_val_u8  (ctx, "some.parameter.uint8",    0x12);
+    gguf_set_val_i8  (ctx, "some.parameter.int8",    -0x13);
+    gguf_set_val_u16 (ctx, "some.parameter.uint16",   0x1234);
+    gguf_set_val_i16 (ctx, "some.parameter.int16",   -0x1235);
+    gguf_set_val_u32 (ctx, "some.parameter.uint32",   0x12345678);
+    gguf_set_val_i32 (ctx, "some.parameter.int32",   -0x12345679);
+    gguf_set_val_f32 (ctx, "some.parameter.float32",  0.123456789f);
+    gguf_set_val_u64 (ctx, "some.parameter.uint64",   0x123456789abcdef0ull);
+    gguf_set_val_i64 (ctx, "some.parameter.int64",   -0x123456789abcdef1ll);
+    gguf_set_val_f64 (ctx, "some.parameter.float64",  0.1234567890123456789);
+    gguf_set_val_bool(ctx, "some.parameter.bool",     true);
+    gguf_set_val_str (ctx, "some.parameter.string",   "hello world");
+
+    gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16,   std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
+    gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
+    gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ 128ull*1024ull*1024ull,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx_data = ggml_init(params);
+
+    const int n_tensors = 10;
+
+    // tensor infos
+    for (int i = 0; i < n_tensors; ++i) {
+        const std::string name = "tensor_" + to_string(i);
+
+        int64_t ne[GGML_MAX_DIMS] = { 1 };
+        int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
+
+        for (int j = 0; j < n_dims; ++j) {
+            ne[j] = rand() % 10 + 1;
+        }
+
+        struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
+        ggml_set_name(cur, name.c_str());
+
+        {
+            float * data = (float *) cur->data;
+            for (int j = 0; j < ggml_nelements(cur); ++j) {
+                data[j] = 100 + i;
+            }
+        }
+
+        gguf_add_tensor(ctx, cur);
+    }
+
+    gguf_write_to_file(ctx, fname.c_str(), false);
+
+    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
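A note on `gguf_write_to_file(ctx, fname.c_str(), false)`: to the best of our reading of the gguf API used here, the final boolean requests meta-only output (header plus KV section, without tensor data) when set to true; treat that interpretation as an assumption. A minimal sketch, with an illustrative file name:

```cpp
#include "ggml.h" // gguf_* API as used by this example
#include <cstdio>

// Write only the GGUF header/KV section (assumption: 'true' == meta-only);
// handy for inspecting metadata without materializing tensor data.
int main() {
    struct gguf_context * ctx = gguf_init_empty();
    gguf_set_val_str(ctx, "general.name", "meta-demo"); // hypothetical value
    gguf_write_to_file(ctx, "meta-only.gguf", true);
    printf("wrote meta-only.gguf\n");
    gguf_free(ctx);
    return 0;
}
```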
+
+// just read tensor info
+static bool gguf_ex_read_0(const std::string & fname) {
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ NULL,
+    };
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
+    printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+    // kv
+    {
+        const int n_kv = gguf_get_n_kv(ctx);
+
+        printf("%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);
+
+            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+
+    // find kv string
+    {
+        const char * findkey = "some.parameter.string";
+
+        const int keyidx = gguf_find_key(ctx, findkey);
+        if (keyidx == -1) {
+            printf("%s: find key: %s not found.\n", __func__, findkey);
+        } else {
+            const char * key_value = gguf_get_val_str(ctx, keyidx);
+            printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
+        }
+    }
+
+    // tensor info
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        printf("%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name = gguf_get_tensor_name (ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+
+    gguf_free(ctx);
+
+    return true;
+}
+
+// read and create ggml_context containing the tensors and their data
+static bool gguf_ex_read_1(const std::string & fname) {
+    struct ggml_context * ctx_data = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx_data,
+    };
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
+    printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+    // kv
+    {
+        const int n_kv = gguf_get_n_kv(ctx);
+
+        printf("%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);
+
+            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+
+    // tensor info
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        printf("%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name = gguf_get_tensor_name (ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+
+    // data
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            printf("%s: reading tensor %d data\n", __func__, i);
+
+            const char * name = gguf_get_tensor_name(ctx, i);
+
+            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+
+            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+
+            // print first 10 elements
+            const float * data = (const float *) cur->data;
+
+            printf("%s data[:10] : ", name);
+            for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
+                printf("%f ", data[j]);
+            }
+            printf("\n\n");
+
+            // check data
+            {
+                const float * data = (const float *) cur->data;
+                for (int j = 0; j < ggml_nelements(cur); ++j) {
+                    if (data[j] != 100 + i) {
+                        fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+
+    printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    if (argc < 3) {
+        printf("usage: %s data.gguf r|w\n", argv[0]);
+        return -1;
+    }
+
+    const std::string fname(argv[1]);
+    const std::string mode (argv[2]);
+
+    GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
+
+    if (mode == "w") {
+        GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
+    } else if (mode == "r") {
+        GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
+        GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
+    }
+
+    return 0;
+}
diff --git a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d433f4b1acf01019344e66ce9eea59e7ed7d299
--- /dev/null
+++ b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
@@ -0,0 +1,1133 @@
+#ifndef CMPNCT_GPT2BPE
+#define CMPNCT_GPT2BPE
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+// Unicode GPT2 Byte Pair Encoding Tokenizer
+// Adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
+// Removed loading of merges from HF json and parts made for a specific vocab
+
+
+//-----------------
+// Unicode library (from cmpnct_unicode.cpp)
+//-----------------
+
+// Minimal library for high performance handling and categorization of UTF8 strings and characters
+// Using std::string
+
+enum CNCTCharType {
+    DIGIT,          // a numerical char in any language
+    LETTER,         // a letter in any language
+    WHITESPACE,     // any form of whitespace
+    ACCENT_MARK,    // letter modifiers like ´ in é
+    PUNCTUATION,    // punctuation including brackets
+    SYMBOL,         // math, currency, other symbols
+    CONTROL,        // control characters
+    MIXED,          // a mix of the above
+    UNIDENTIFIED    // something more exotic like emoji or separators
+};
+
+struct CNCTUnicode;
+
+struct CNCTString {
+    std::string str;
+    size_t utf8_chars;
+
+    CNCTCharType char_type=UNIDENTIFIED;
+    bool is_sequential=false;
+
+    size_t seq_offset_bytes=0;
+    size_t seq_offset_utf8_chars=0;
+
+    bool operator==(const std::string &other) const;
+    bool operator==(const char other) const;
+    bool operator==(const CNCTString &other) const;
+    CNCTString &operator+=(const std::string &other);
+    CNCTString &operator+=(const char other);
+    friend CNCTString operator+(CNCTString lhs, const std::string &rhs);
+    friend CNCTString operator+(CNCTString lhs, const char rhs);
+    CNCTString& operator+=(const CNCTString& other);
+    
friend CNCTString operator+(CNCTString lhs, const CNCTString& rhs); +}; + +struct CNCTUnicode { + static bool check_code_range(int c, const std::vector>& ranges); + static CNCTCharType get_code_type(int c); + static CNCTCharType get_code_type(const std::string &utf8_char); + static int utf8_len(const char c); + static int strlen_utf8(std::string src); + static std::vector split_utf8(const std::string &src); + static std::vector split_utf8_enhanced(const std::string &src); + static CNCTCharType string_identify(const std::string& str); + static bool string_test(const std::string& str, CNCTCharType chartype); +}; + +static const std::vector> digit_ranges = { +{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F}, +{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F}, +{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468}, +{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909}, +{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A}, +{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739}, +{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9}, +{0x1E950, 0x1E959}, {0x1F100, 0x1F10A}, {0x1FBF0, 0x1FBF9}, +}; + +static const std::vector> letter_ranges = { +{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374}, +{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559}, +{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710}, +{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A}, +{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2}, +{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33}, +{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD}, +{0xAD0, 0xAD0}, {0xAE0, 
0xAE1}, {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61}, +{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0}, +{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3}, +{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61}, +{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A}, +{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C}, +{0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7}, +{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5}, +{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C}, +{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3}, +{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB}, +{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F}, +{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D}, +{0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4}, +{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107}, +{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E}, +{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F}, +{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006}, +{0x3031, 0x3035}, {0x303B, 
0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF}, +{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788}, +{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE}, +{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B}, +{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4}, +{0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB}, +{0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44}, +{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7}, +{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA}, +{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D}, +{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835}, +{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7}, +{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35}, +{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C}, +{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147}, +{0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288}, +{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339}, +{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, 
{0x11580, 0x115AE}, +{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909}, +{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00}, +{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F}, +{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0}, +{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77}, +{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08}, +{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C}, +{0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514}, +{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA}, +{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D}, +{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27}, +{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52}, +{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72}, +{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734}, +{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A}, +}; + +static const std::vector> whitespace_ranges = { +{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}, +}; + +static const std::vector> accent_mark_ranges = { +{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4}, +{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, {0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 
0x827}, {0x829, 0x82D}, {0x859, 0x85B}, +{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7}, +{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC}, +{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63}, +{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83}, +{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57}, +{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 0xEB1}, {0xEB4, 0xEBC}, +{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E}, +{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734}, +{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E}, +{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2}, +{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F}, +{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881}, +{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D}, +{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E}, +{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F}, +{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134}, +{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303}, +{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, 
{0x11435, 0x11446}, {0x1145E, 0x1145E}, +{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938}, +{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47}, +{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45}, +{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92}, +{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36}, +{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A}, +{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF}, +}; + +static const std::vector> punctuation_ranges = { +{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB}, +{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D}, +{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76}, +{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA}, +{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, {0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A}, +{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027}, +{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998}, +{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F}, +{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF}, +{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F}, +{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, 
{0xFF1F, 0xFF20}, +{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857}, +{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D}, +{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9}, +{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946}, +{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F}, +{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F}, +}; + +static const std::vector> symbol_ranges = { +{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7}, +{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608}, +{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA}, +{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5}, +{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C}, +{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF}, +{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B}, +{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 0x2767}, {0x2794, 0x27C4}, +{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, +{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3}, +{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A}, +{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, 
{0xFE64, 0xFE66}, {0xFE69, 0xFE69}, +{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F}, +{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F}, +{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241}, +{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789}, +{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC}, +{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F10D, 0x1F1AD}, +{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8}, +{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53}, +{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA}, +}; + +static const std::vector> control_ranges = { +{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C}, +{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F}, +{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5}, +{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29}, +{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80}, +{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5}, +{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54}, +{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7}, +{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 
0xBE5}, {0xBFB, 0xBFF}, {0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C}, +{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB}, +{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49}, +{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7}, +{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5}, +{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC}, +{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, {0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF}, +{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F}, +{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF}, +{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F}, +{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F}, +{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F}, +{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC}, +{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF}, +{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F}, +{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF}, +{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF}, +{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F}, +{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, 
{0xA9FF, 0xA9FF}, {0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA}, +{0xAAF7, 0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA}, +{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2}, +{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1}, +{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B}, +{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F}, +{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, {0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F}, +{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807}, +{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E}, +{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E}, +{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8}, +{0x10BB0, 0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF}, +{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF}, +{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E}, +{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334}, +{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C}, +{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF}, +{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936}, +{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, 
{0x119A8, 0x119A9}, {0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09}, +{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B}, +{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF}, +{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F}, +{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF}, +{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163}, +{0x1B168, 0x1B16F}, {0x1B2FC, 0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A}, +{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8}, +{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F}, +{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A}, +{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6}, +{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26}, +{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 0x1EE50}, +{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B}, +{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF}, +{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F}, +{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F}, +{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 
0x1FA7F}, +{0x1FA87, 0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F}, +{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF}, +}; + +//String +bool CNCTString::operator==(const std::string& other) const { + return str.compare(other) == 0; +} +bool CNCTString::operator==(const char other) const { + return str.compare(std::string(1, other)) == 0; +} +bool CNCTString::operator==(const CNCTString& other) const { + return str.compare(other.str) == 0; +} +// + operators +CNCTString& CNCTString::operator+=(const std::string& other) { + str += other; + int new_len = CNCTUnicode::strlen_utf8(other); + utf8_chars += new_len; + char_type = CNCTUnicode::string_identify(str); + seq_offset_bytes += other.size(); + seq_offset_utf8_chars += new_len; + return *this; +} + +CNCTString& CNCTString::operator+=(const char other) { + std::string str = std::string(1, other); + *this += str; + return *this; +} + +CNCTString& CNCTString::operator+=(const CNCTString& other) { + str += other.str; + utf8_chars += other.utf8_chars; + char_type = CNCTUnicode::string_identify(str); + seq_offset_bytes += other.str.size(); + seq_offset_utf8_chars += other.utf8_chars; + return *this; +} + +struct CRCompare { + bool operator()(const std::pair& p, int i) { + return p.second < i; + } + bool operator()(int i, const std::pair& p) { + return i < p.first; + } +}; + +// binary search for code range +bool CNCTUnicode::check_code_range(int c, const std::vector> &ranges) { + auto it = std::upper_bound(ranges.begin(), ranges.end(), c, CRCompare()); + if (it != ranges.begin()) { + --it; + } + return c >= it->first && c <= it->second; +} + +// these are binary searches, it takes only a few operations +CNCTCharType CNCTUnicode::get_code_type(int c) { + if (check_code_range(c, letter_ranges)) { + return LETTER; + } + if (check_code_range(c, digit_ranges)) { + return DIGIT; + } + if (check_code_range(c, whitespace_ranges)) { + return WHITESPACE; + } + if (check_code_range(c, punctuation_ranges)) { + return PUNCTUATION; + } + if (check_code_range(c, symbol_ranges)) { + return SYMBOL; + } + if (check_code_range(c, accent_mark_ranges)) { + return ACCENT_MARK; + } + if (check_code_range(c, control_ranges)) { + return CONTROL; + } + return UNIDENTIFIED; +} + +static int utf8_to_unicode(const std::string& utf8_char) { + int c = 0; + int len = (int)utf8_char.size(); + if (len == 1) { + c = utf8_char[0]; + } else if (len == 2) { + c = ((utf8_char[0] & 0x1F) << 6) | (utf8_char[1] & 0x3F); + } else if (len == 3) { + c = ((utf8_char[0] & 0x0F) << 12) | ((utf8_char[1] & 0x3F) << 6) | (utf8_char[2] & 0x3F); + } else if (len == 4) { + c = ((utf8_char[0] & 0x07) << 18) | ((utf8_char[1] & 0x3F) << 12) | ((utf8_char[2] & 0x3F) << 6) | (utf8_char[3] & 0x3F); + } + return c; +} + +CNCTCharType CNCTUnicode::get_code_type(const std::string &utf8_char) { + return get_code_type(utf8_to_unicode(utf8_char)); +} + +int CNCTUnicode::utf8_len(const char c) +{ + if ((c & 0x80) == 0) { + return 1; // ASCII character + } + if ((c & 0xE0) == 0xC0) { + return 2; // 2-byte character + } + if ((c & 0xF0) == 0xE0) { + return 3; // 3-byte character + } + if ((c & 0xF0) == 0xF0) { + return 4; // 4-byte character + } + return 1; // not valid utf8 + // static const uint8_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + // return lookup[static_cast(c) >> 4]; 
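+    // Worked example of the leading-byte tests above: "é" is the byte
+    // sequence 0xC3 0xA9, and (0xC3 & 0xE0) == 0xC0, so utf8_len returns 2
+    // and callers skip one continuation byte; "€" (0xE2 0x82 0xAC) matches
+    // (c & 0xF0) == 0xE0 and yields 3. Note that the 4-byte test
+    // (c & 0xF0) == 0xF0 also matches the invalid leading bytes 0xF8-0xFF;
+    // a stricter check would be (c & 0xF8) == 0xF0.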
+}
+
+int CNCTUnicode::strlen_utf8(const std::string src) {
+    int len = 0;
+    for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) {
+        int char_len = utf8_len(*it);
+        if (char_len > 1) {
+            it += char_len - 1;
+        }
+        len += 1;
+    }
+    return len;
+}
+
+// split a string into unicode strings
+std::vector<std::string> CNCTUnicode::split_utf8(const std::string &src) {
+    std::vector<std::string> result;
+    for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) {
+        int char_len = utf8_len(*it);
+        std::string str(it, it + char_len);
+        result.push_back(str);
+        if (char_len > 1) {
+            it += char_len - 1;
+        }
+    }
+    return result;
+}
+
+// split a string into unicode strings (CNCTString) with sequence information
+std::vector<CNCTString> CNCTUnicode::split_utf8_enhanced(const std::string &src) {
+    std::vector<CNCTString> result;
+    int seq_offset_bytes=0;
+    int seq_offset_utf8_chars=0;
+    for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) {
+        int char_len = utf8_len(*it);
+        std::string str(it, it + char_len);
+        CNCTString cnct_str;
+        cnct_str.seq_offset_bytes = seq_offset_bytes;
+        cnct_str.seq_offset_utf8_chars = seq_offset_utf8_chars;
+        cnct_str.str = str;
+        cnct_str.utf8_chars = 1;
+        cnct_str.char_type = get_code_type(str);
+        #if 0
+        switch (cnct_str.char_type)
+        {
+            case DIGIT:
+                printf("%s = DIGIT\n", str.c_str());
+                break;
+            case LETTER:
+                printf("%s = LETTER\n", str.c_str());
+                break;
+            case WHITESPACE:
+                printf("%s = WHITESPACE\n", str.c_str());
+                break;
+            case PUNCTUATION:
+                printf("%s = PUNCTUATION\n", str.c_str());
+                break;
+            case UNIDENTIFIED:
+                printf("%s = UNIDENTIFIED\n", str.c_str());
+                break;
+            case SYMBOL:
+                printf("%s = SYMBOL\n", str.c_str());
+                break;
+            case CONTROL:
+                printf("%s = CONTROL\n", str.c_str());
+                break;
+        }
+        #endif
+
+        result.push_back(cnct_str);
+        seq_offset_bytes += char_len;
+        seq_offset_utf8_chars += 1;
+        if (char_len > 1) {
+            it += char_len - 1;
+        }
+
+    }
+    return result;
+}
+
+// return the type of the string
+CNCTCharType CNCTUnicode::string_identify(const std::string &str) {
+    CNCTCharType result = UNIDENTIFIED;
+    std::string::const_iterator it = str.begin();
+    while (it != str.end()) {
+        int len = utf8_len(*it);
+        int c = 0;
+        for (int i = 0; i < len && it != str.end(); ++i, ++it) {
+            c = (c << 8) | static_cast<unsigned char>(*it);
+        }
+        switch (get_code_type(c)) {
+            case DIGIT:
+                if (result == UNIDENTIFIED) {
+                    result = DIGIT;
+                } else if (result != DIGIT) {
+                    return MIXED;
+                }
+                break;
+            case LETTER:
+                if (result == UNIDENTIFIED) {
+                    result = LETTER;
+                } else if (result != LETTER) {
+                    return MIXED;
+                }
+                break;
+            case WHITESPACE:
+                if (result == UNIDENTIFIED) {
+                    result = WHITESPACE;
+                } else if (result != WHITESPACE) {
+                    return MIXED;
+                }
+                break;
+            case PUNCTUATION:
+                if (result == UNIDENTIFIED) {
+                    result = PUNCTUATION;
+                } else if (result != PUNCTUATION) {
+                    return MIXED;
+                }
+                break;
+            default:
+                return MIXED;
+                break;
+        }
+    }
+    return result;
+}
+
+// verify the content of a string
+bool CNCTUnicode::string_test(const std::string &str, CNCTCharType chartype)
+{
+    std::string::const_iterator it = str.begin();
+    while (it != str.end()) {
+        int len = utf8_len(*it);
+        int c = 0;
+        for (int i = 0; i < len && it != str.end(); ++i, ++it) {
+            c = (c << 8) | static_cast<unsigned char>(*it);
+        }
+        if (get_code_type(c) != chartype) {
+            return false;
+        }
+    }
+    return true;
+}
+
+//-----------------
+// llama.cpp GPT2 vocab (from libfalcon.cpp)
+//-----------------
+
+std::string replaceAll(std::string str, const std::string& from, const std::string& to) {
+    size_t start_pos = 0;
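+    // Note: replaceAll is used by find_bpe_rank below to map " " -> "Ġ" and
+    // "\n" -> "Ċ" (the GPT-2 byte-level stand-ins for space and newline, see
+    // bytes_to_unicode) before a pair is looked up in bpe_ranks. Advancing
+    // start_pos by to.length() in the loop below ensures forward progress
+    // even when 'to' contains 'from'.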
+    while((start_pos = str.find(from, start_pos)) != std::string::npos) {
+        str.replace(start_pos, from.length(), to);
+        start_pos += to.length(); // Handles case where 'to' is a substring of 'from'
+    }
+    return str;
+}
+
+struct TrieNode {
+    std::map<char, TrieNode*> map;
+    int32_t Id = -1;
+};
+
+struct Trie {
+    TrieNode *root;
+
+    Trie() : root(new TrieNode()) {}
+
+    ~Trie() {
+        if(root)
+            deleteTrie(root);
+    }
+
+    // Move constructor
+    Trie(Trie&& other) noexcept : root(other.root) {
+        other.root = nullptr;
+    }
+
+    // Move assignment operator
+    Trie& operator=(Trie&& other) noexcept {
+        if (this != &other) {
+            if(root)
+                deleteTrie(root);
+            root = other.root;
+            other.root = nullptr;
+        }
+        return *this;
+    }
+
+    void insert(const std::string &token, int32_t Id) {
+        TrieNode* current = root;
+        for(auto ch : token) {
+            if(current->map.find(ch) == current->map.end()) {
+                current->map[ch] = new TrieNode();
+            }
+            current = current->map[ch];
+        }
+        current->Id = Id;
+    }
+
+    void reset() {
+        deleteTrie(root);
+        root = new TrieNode();
+    }
+
+private:
+    void deleteTrie(TrieNode* node) {
+        for(auto &it: node->map) {
+            deleteTrie(it.second);
+        }
+        delete node;
+    }
+
+};
+
+struct gpt2bpe_vocab {
+    using id = int32_t;
+    using token = std::string;
+
+    std::map<std::string, uint32_t> max_token_length; // max length, for each 2byte prefix
+    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+    std::vector<std::pair<std::string, std::string>> bpe_merges;
+
+    id special_bos_id = -1;
+    id special_eos_id = -1;
+    id special_unk_id = -1;
+    id special_sep_id = -1;
+    id special_pad_id = -1;
+
+    id linefeed_id = -1;
+
+    std::unordered_map<token, id> token_to_id;
+    std::unordered_map<id, token> id_to_token;
+
+    Trie trie; // high-speed access to tokens by prefix tree
+
+    // populate trie from map
+    void populate_trie_from_map() {
+        trie.reset();
+        for (const auto& pair : token_to_id) {
+            trie.insert(pair.first, pair.second);
+            if (pair.first.size() >= 2) {
+                std::string prefix = pair.first.substr(0, 2);
+                max_token_length[prefix] = std::max(max_token_length[prefix], (uint32_t)pair.first.size());
+            }
+        }
+    }
+    // populate token ranks map
+    int populate_bpe_ranks(std::vector<std::pair<std::string, std::string>> bpe_merges_) {
+        for (int i = 0; i < (int)bpe_merges_.size(); i++) {
+            bpe_ranks.emplace(bpe_merges_[i], i);
+        }
+        bpe_merges = bpe_merges_;
+        return bpe_merges_.size();
+    }
+
+    // Trim whitespace characters from the beginning and end of the string
+    void trim(std::string& str) {
+        // Remove whitespace characters from the beginning of the string
+        str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](int ch) {
+            return !std::isspace(ch);
+        }));
+
+        // Remove whitespace characters from the end of the string
+        str.erase(std::find_if(str.rbegin(), str.rend(), [](int ch) {
+            return !std::isspace(ch);
+        }).base(), str.end());
+    }
+
+    // get max token length available for a prefix of 2 bytes (string at least 2 bytes long)
+    int get_max_token_length(const std::string& string) const {
+        if (string.size() < 2) {
+            return -1;
+        }
+        std::string prefix = string.substr(0, 2);
+        if (max_token_length.find(prefix) == max_token_length.end()) {
+            return 0;
+        }
+        return max_token_length.at(prefix);
+    }
+
+    // function to find if two tokens match in bpe_rank, return rank or -1
+    int find_bpe_rank(const std::string& token1, const std::string& token2) const {
+        std::string left_token = token1;
+        std::string right_token = token2;
+        left_token = replaceAll(left_token, " ", "Ġ");
+        left_token = replaceAll(left_token, "\n", "Ċ");
+        right_token = replaceAll(right_token, " ", "Ġ");
+        right_token = replaceAll(right_token, "\n", "Ċ");
+
+        auto it = bpe_ranks.find(std::make_pair(left_token, right_token));
+        if (it == bpe_ranks.end()) {
+            return -1;
+        }
+        return it->second;
+    }
+
+    std::pair<gpt2bpe_vocab::id, std::string> find_longest_match(const std::string& snippet) const {
+        TrieNode* current = trie.root;
+        gpt2bpe_vocab::id last_matched_id = -1;
+        std::string last_matched_token = "";
+        std::string current_token = "";
+        for (auto ch : snippet) {
+            if (current->map.find(ch) == current->map.end()) {
+                break;
+            }
+            current = current->map[ch];
+            current_token += ch;
+            if (current->Id != -1) {
+                last_matched_id = current->Id;
+                last_matched_token = current_token;
+            }
+        }
+        return {last_matched_id, last_matched_token};
+    }
+
+};
+
+
+//
+// tokenizer - bpe type, gpt2 tokenization compatible
+//
+
+struct ggllm_bpe_symbol {
+    using index = int;
+    index prev;
+    index next;
+    const char * text;
+    size_t n;
+};
+
+static_assert(std::is_trivially_copyable<ggllm_bpe_symbol>::value, "ggllm_bpe_symbol is not trivially copyable");
+
+struct ggllm_bpe_bigram {
+    struct comparator {
+        bool operator()(ggllm_bpe_bigram & l, ggllm_bpe_bigram & r) {
+            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
+        }
+    };
+
+    using queue_storage = std::vector<ggllm_bpe_bigram>;
+    using queue = std::priority_queue<ggllm_bpe_bigram, queue_storage, comparator>;
+    ggllm_bpe_symbol::index left;
+    ggllm_bpe_symbol::index right;
+    std::string text;
+    int rank;
+    size_t size;
+};
+
+struct gpt2bpe_tokenizer {
+    gpt2bpe_tokenizer(const gpt2bpe_vocab & vocab, bool g2ws_): vocab_(vocab) { flag_g2ws = g2ws_; }
+
+    void tokenize(const std::string & text, std::vector<gpt2bpe_vocab::id> & output) {
+        int final_prev_index = -1;
+        // auto start = ggml_time_us();
+        auto word_collection = bpe_gpt2_preprocess(text);
+        // auto end = ggml_time_us();
+        // fprintf(stderr, "%s: preprocessing took %0.3f ms\n", __func__, (end - start) / 1000.0);
+
+        symbols_final.clear();
+
+        for (auto & word : word_collection) {
+            work_queue_ = ggllm_bpe_bigram::queue();
+            symbols_.clear();
+
+            int index = 0;
+            size_t offset = 0;
+
+            while (offset < word.size()) {
+                ggllm_bpe_symbol sym;
+                size_t char_len = std::min(word.size() - offset, (size_t) CNCTUnicode::utf8_len(word[offset]));
+                sym.text = word.c_str() + offset;
+                sym.n = 1;
+                sym.n = char_len;
+                offset += sym.n;
+                sym.prev = index - 1;
+                sym.next = offset == word.size() ?
-1 : index + 1; + index++; + symbols_.emplace_back(sym); + } + for (size_t i = 1; i < symbols_.size(); ++i) { + add_new_bigram(i - 1, i); + } + + // build token(s) + while (!work_queue_.empty()) { + auto bigram = work_queue_.top(); + work_queue_.pop(); + + auto & left_symbol = symbols_[bigram.left]; + auto & right_symbol = symbols_[bigram.right]; + + if (left_symbol.n == 0 || right_symbol.n == 0) { + continue; + } + std::string left_token = std::string(left_symbol.text, left_symbol.n); + std::string right_token = std::string(right_symbol.text, right_symbol.n); + if (left_token + right_token != bigram.text) { + continue; // Skip this bigram if it's outdated + } + + // merge the right sym into the left one + left_symbol.n += right_symbol.n; + right_symbol.n = 0; + + // remove the right sym from the chain + left_symbol.next = right_symbol.next; + if (right_symbol.next >= 0) { + symbols_[right_symbol.next].prev = bigram.left; + } + + add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol + add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol + } + + // add the fnished tokens to the final list keeping correct order for next and prev + for (auto & sym : symbols_) { + if (sym.n > 0) { + sym.prev = final_prev_index; + sym.next = -1; + if (final_prev_index != -1) { + symbols_final[final_prev_index].next = symbols_final.size(); + } + symbols_final.emplace_back(sym); + final_prev_index = symbols_final.size() - 1; + } + } + } + + symbols_ = symbols_final; + if (symbols_.size()) + for (int i = 0; i != -1; i = symbols_[i].next) { + auto & symbol = symbols_[i]; + if (symbol.n == 0) { + continue; + } + std::string str = std::string(symbol.text, symbol.n); + std::string str_decoded = decode_token(str); + auto token = vocab_.token_to_id.find(str_decoded); + + if (token == vocab_.token_to_id.end()) { + for (auto j = str_decoded.begin(); j != str_decoded.end(); ++j) { + std::string byte_str(1, *j); + auto token_multibyte = vocab_.token_to_id.find(byte_str); + if (token_multibyte == vocab_.token_to_id.end()) { + fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str()); + } + output.push_back((*token_multibyte).second); + } + } else { + output.push_back((*token).second); + } + } + } + +private: + void add_new_bigram(int left, int right) { + if (left == -1 || right == -1) return; + + std::string left_token = std::string(symbols_[left].text, symbols_[left].n); + std::string right_token = std::string(symbols_[right].text, symbols_[right].n); + + int rank_found = -1; + rank_found = vocab_.find_bpe_rank(left_token, right_token); + + if (rank_found < 0) { + return; + } + + ggllm_bpe_bigram bigram; + bigram.left = left; + bigram.right = right; + bigram.rank = rank_found; + bigram.size = left_token.size() + right_token.size(); + bigram.text = left_token + right_token; + work_queue_.push(bigram); + } + + std::unordered_map bytes_to_unicode() { + static std::unordered_map hex_map = { + { 0x21, "\x21" }, { 0x22, "\x22" }, { 0x23, "\x23" }, { 0x24, "\x24" }, { 0x25, "\x25" }, { 0x26, "\x26" }, { 0x27, "\x27" }, { 0x28, "\x28" }, { 0x29, "\x29" }, { 0x2A, "\x2A" }, + { 0x2B, "\x2B" }, { 0x2C, "\x2C" }, { 0x2D, "\x2D" }, { 0x2E, "\x2E" }, { 0x2F, "\x2F" }, { 0x30, "\x30" }, { 0x31, "\x31" }, { 0x32, "\x32" }, { 0x33, "\x33" }, { 0x34, "\x34" }, + { 0x35, "\x35" }, { 0x36, "\x36" }, { 0x37, "\x37" }, { 0x38, "\x38" }, { 0x39, "\x39" }, { 0x3A, "\x3A" }, { 0x3B, "\x3B" }, { 0x3C, "\x3C" }, { 0x3D, "\x3D" }, { 0x3E, "\x3E" }, + { 0x3F, "\x3F" }, { 0x40, "\x40" 
}, { 0x41, "\x41" }, { 0x42, "\x42" }, { 0x43, "\x43" }, { 0x44, "\x44" }, { 0x45, "\x45" }, { 0x46, "\x46" }, { 0x47, "\x47" }, { 0x48, "\x48" }, + { 0x49, "\x49" }, { 0x4A, "\x4A" }, { 0x4B, "\x4B" }, { 0x4C, "\x4C" }, { 0x4D, "\x4D" }, { 0x4E, "\x4E" }, { 0x4F, "\x4F" }, { 0x50, "\x50" }, { 0x51, "\x51" }, { 0x52, "\x52" }, + { 0x53, "\x53" }, { 0x54, "\x54" }, { 0x55, "\x55" }, { 0x56, "\x56" }, { 0x57, "\x57" }, { 0x58, "\x58" }, { 0x59, "\x59" }, { 0x5A, "\x5A" }, { 0x5B, "\x5B" }, { 0x5C, "\x5C" }, + { 0x5D, "\x5D" }, { 0x5E, "\x5E" }, { 0x5F, "\x5F" }, { 0x60, "\x60" }, { 0x61, "\x61" }, { 0x62, "\x62" }, { 0x63, "\x63" }, { 0x64, "\x64" }, { 0x65, "\x65" }, { 0x66, "\x66" }, + { 0x67, "\x67" }, { 0x68, "\x68" }, { 0x69, "\x69" }, { 0x6A, "\x6A" }, { 0x6B, "\x6B" }, { 0x6C, "\x6C" }, { 0x6D, "\x6D" }, { 0x6E, "\x6E" }, { 0x6F, "\x6F" }, { 0x70, "\x70" }, + { 0x71, "\x71" }, { 0x72, "\x72" }, { 0x73, "\x73" }, { 0x74, "\x74" }, { 0x75, "\x75" }, { 0x76, "\x76" }, { 0x77, "\x77" }, { 0x78, "\x78" }, { 0x79, "\x79" }, { 0x7A, "\x7A" }, + { 0x7B, "\x7B" }, { 0x7C, "\x7C" }, { 0x7D, "\x7D" }, { 0x7E, "\x7E" }, { 0xA1, "\xC2\xA1" }, { 0xA2, "\xC2\xA2" }, { 0xA3, "\xC2\xA3" }, { 0xA4, "\xC2\xA4" }, { 0xA5, "\xC2\xA5" }, + { 0xA6, "\xC2\xA6" }, { 0xA7, "\xC2\xA7" }, { 0xA8, "\xC2\xA8" }, { 0xA9, "\xC2\xA9" }, { 0xAA, "\xC2\xAA" }, { 0xAB, "\xC2\xAB" }, { 0xAC, "\xC2\xAC" }, { 0xAE, "\xC2\xAE" }, + { 0xAF, "\xC2\xAF" }, { 0xB0, "\xC2\xB0" }, { 0xB1, "\xC2\xB1" }, { 0xB2, "\xC2\xB2" }, { 0xB3, "\xC2\xB3" }, { 0xB4, "\xC2\xB4" }, { 0xB5, "\xC2\xB5" }, { 0xB6, "\xC2\xB6" }, + { 0xB7, "\xC2\xB7" }, { 0xB8, "\xC2\xB8" }, { 0xB9, "\xC2\xB9" }, { 0xBA, "\xC2\xBA" }, { 0xBB, "\xC2\xBB" }, { 0xBC, "\xC2\xBC" }, { 0xBD, "\xC2\xBD" }, { 0xBE, "\xC2\xBE" }, + { 0xBF, "\xC2\xBF" }, { 0xC0, "\xC3\x80" }, { 0xC1, "\xC3\x81" }, { 0xC2, "\xC3\x82" }, { 0xC3, "\xC3\x83" }, { 0xC4, "\xC3\x84" }, { 0xC5, "\xC3\x85" }, { 0xC6, "\xC3\x86" }, + { 0xC7, "\xC3\x87" }, { 0xC8, "\xC3\x88" }, { 0xC9, "\xC3\x89" }, { 0xCA, "\xC3\x8A" }, { 0xCB, "\xC3\x8B" }, { 0xCC, "\xC3\x8C" }, { 0xCD, "\xC3\x8D" }, { 0xCE, "\xC3\x8E" }, + { 0xCF, "\xC3\x8F" }, { 0xD0, "\xC3\x90" }, { 0xD1, "\xC3\x91" }, { 0xD2, "\xC3\x92" }, { 0xD3, "\xC3\x93" }, { 0xD4, "\xC3\x94" }, { 0xD5, "\xC3\x95" }, { 0xD6, "\xC3\x96" }, + { 0xD7, "\xC3\x97" }, { 0xD8, "\xC3\x98" }, { 0xD9, "\xC3\x99" }, { 0xDA, "\xC3\x9A" }, { 0xDB, "\xC3\x9B" }, { 0xDC, "\xC3\x9C" }, { 0xDD, "\xC3\x9D" }, { 0xDE, "\xC3\x9E" }, + { 0xDF, "\xC3\x9F" }, { 0xE0, "\xC3\xA0" }, { 0xE1, "\xC3\xA1" }, { 0xE2, "\xC3\xA2" }, { 0xE3, "\xC3\xA3" }, { 0xE4, "\xC3\xA4" }, { 0xE5, "\xC3\xA5" }, { 0xE6, "\xC3\xA6" }, + { 0xE7, "\xC3\xA7" }, { 0xE8, "\xC3\xA8" }, { 0xE9, "\xC3\xA9" }, { 0xEA, "\xC3\xAA" }, { 0xEB, "\xC3\xAB" }, { 0xEC, "\xC3\xAC" }, { 0xED, "\xC3\xAD" }, { 0xEE, "\xC3\xAE" }, + { 0xEF, "\xC3\xAF" }, { 0xF0, "\xC3\xB0" }, { 0xF1, "\xC3\xB1" }, { 0xF2, "\xC3\xB2" }, { 0xF3, "\xC3\xB3" }, { 0xF4, "\xC3\xB4" }, { 0xF5, "\xC3\xB5" }, { 0xF6, "\xC3\xB6" }, + { 0xF7, "\xC3\xB7" }, { 0xF8, "\xC3\xB8" }, { 0xF9, "\xC3\xB9" }, { 0xFA, "\xC3\xBA" }, { 0xFB, "\xC3\xBB" }, { 0xFC, "\xC3\xBC" }, { 0xFD, "\xC3\xBD" }, { 0xFE, "\xC3\xBE" }, + { 0xFF, "\xC3\xBF" }, { 0x00, "\xC4\x80" }, { 0x01, "\xC4\x81" }, { 0x02, "\xC4\x82" }, { 0x03, "\xC4\x83" }, { 0x04, "\xC4\x84" }, { 0x05, "\xC4\x85" }, { 0x06, "\xC4\x86" }, + { 0x07, "\xC4\x87" }, { 0x08, "\xC4\x88" }, { 0x09, "\xC4\x89" }, { 0x0A, "\xC4\x8A" }, { 0x0B, "\xC4\x8B" }, { 0x0C, "\xC4\x8C" }, { 0x0D, "\xC4\x8D" }, { 0x0E, "\xC4\x8E" }, 
+ { 0x0F, "\xC4\x8F" }, { 0x10, "\xC4\x90" }, { 0x11, "\xC4\x91" }, { 0x12, "\xC4\x92" }, { 0x13, "\xC4\x93" }, { 0x14, "\xC4\x94" }, { 0x15, "\xC4\x95" }, { 0x16, "\xC4\x96" }, + { 0x17, "\xC4\x97" }, { 0x18, "\xC4\x98" }, { 0x19, "\xC4\x99" }, { 0x1A, "\xC4\x9A" }, { 0x1B, "\xC4\x9B" }, { 0x1C, "\xC4\x9C" }, { 0x1D, "\xC4\x9D" }, { 0x1E, "\xC4\x9E" }, + { 0x1F, "\xC4\x9F" }, { 0x20, "\xC4\xA0" }, { 0x7F, "\xC4\xA1" }, { 0x80, "\xC4\xA2" }, { 0x81, "\xC4\xA3" }, { 0x82, "\xC4\xA4" }, { 0x83, "\xC4\xA5" }, { 0x84, "\xC4\xA6" }, + { 0x85, "\xC4\xA7" }, { 0x86, "\xC4\xA8" }, { 0x87, "\xC4\xA9" }, { 0x88, "\xC4\xAA" }, { 0x89, "\xC4\xAB" }, { 0x8A, "\xC4\xAC" }, { 0x8B, "\xC4\xAD" }, { 0x8C, "\xC4\xAE" }, + { 0x8D, "\xC4\xAF" }, { 0x8E, "\xC4\xB0" }, { 0x8F, "\xC4\xB1" }, { 0x90, "\xC4\xB2" }, { 0x91, "\xC4\xB3" }, { 0x92, "\xC4\xB4" }, { 0x93, "\xC4\xB5" }, { 0x94, "\xC4\xB6" }, + { 0x95, "\xC4\xB7" }, { 0x96, "\xC4\xB8" }, { 0x97, "\xC4\xB9" }, { 0x98, "\xC4\xBA" }, { 0x99, "\xC4\xBB" }, { 0x9A, "\xC4\xBC" }, { 0x9B, "\xC4\xBD" }, { 0x9C, "\xC4\xBE" }, + { 0x9D, "\xC4\xBF" }, { 0x9E, "\xC5\x80" }, { 0x9F, "\xC5\x81" }, { 0xA0, "\xC5\x82" }, { 0xAD, "\xC5\x83" } + }; + return hex_map; + } + + std::unordered_map unicode_to_bytes() { + static std::unordered_map hex_map = { + { "\x21", 0x21 }, { "\x22", 0x22 }, { "\x23", 0x23 }, { "\x24", 0x24 }, { "\x25", 0x25 }, { "\x26", 0x26 }, { "\x27", 0x27 }, { "\x28", 0x28 }, { "\x29", 0x29 }, { "\x2A", 0x2A }, + { "\x2B", 0x2B }, { "\x2C", 0x2C }, { "\x2D", 0x2D }, { "\x2E", 0x2E }, { "\x2F", 0x2F }, { "\x30", 0x30 }, { "\x31", 0x31 }, { "\x32", 0x32 }, { "\x33", 0x33 }, { "\x34", 0x34 }, + { "\x35", 0x35 }, { "\x36", 0x36 }, { "\x37", 0x37 }, { "\x38", 0x38 }, { "\x39", 0x39 }, { "\x3A", 0x3A }, { "\x3B", 0x3B }, { "\x3C", 0x3C }, { "\x3D", 0x3D }, { "\x3E", 0x3E }, + { "\x3F", 0x3F }, { "\x40", 0x40 }, { "\x41", 0x41 }, { "\x42", 0x42 }, { "\x43", 0x43 }, { "\x44", 0x44 }, { "\x45", 0x45 }, { "\x46", 0x46 }, { "\x47", 0x47 }, { "\x48", 0x48 }, + { "\x49", 0x49 }, { "\x4A", 0x4A }, { "\x4B", 0x4B }, { "\x4C", 0x4C }, { "\x4D", 0x4D }, { "\x4E", 0x4E }, { "\x4F", 0x4F }, { "\x50", 0x50 }, { "\x51", 0x51 }, { "\x52", 0x52 }, + { "\x53", 0x53 }, { "\x54", 0x54 }, { "\x55", 0x55 }, { "\x56", 0x56 }, { "\x57", 0x57 }, { "\x58", 0x58 }, { "\x59", 0x59 }, { "\x5A", 0x5A }, { "\x5B", 0x5B }, { "\x5C", 0x5C }, + { "\x5D", 0x5D }, { "\x5E", 0x5E }, { "\x5F", 0x5F }, { "\x60", 0x60 }, { "\x61", 0x61 }, { "\x62", 0x62 }, { "\x63", 0x63 }, { "\x64", 0x64 }, { "\x65", 0x65 }, { "\x66", 0x66 }, + { "\x67", 0x67 }, { "\x68", 0x68 }, { "\x69", 0x69 }, { "\x6A", 0x6A }, { "\x6B", 0x6B }, { "\x6C", 0x6C }, { "\x6D", 0x6D }, { "\x6E", 0x6E }, { "\x6F", 0x6F }, { "\x70", 0x70 }, + { "\x71", 0x71 }, { "\x72", 0x72 }, { "\x73", 0x73 }, { "\x74", 0x74 }, { "\x75", 0x75 }, { "\x76", 0x76 }, { "\x77", 0x77 }, { "\x78", 0x78 }, { "\x79", 0x79 }, { "\x7A", 0x7A }, + { "\x7B", 0x7B }, { "\x7C", 0x7C }, { "\x7D", 0x7D }, { "\x7E", 0x7E }, { "\xC2\xA1", 0xA1 }, { "\xC2\xA2", 0xA2 }, { "\xC2\xA3", 0xA3 }, { "\xC2\xA4", 0xA4 }, { "\xC2\xA5", 0xA5 }, + { "\xC2\xA6", 0xA6 }, { "\xC2\xA7", 0xA7 }, { "\xC2\xA8", 0xA8 }, { "\xC2\xA9", 0xA9 }, { "\xC2\xAA", 0xAA }, { "\xC2\xAB", 0xAB }, { "\xC2\xAC", 0xAC }, { "\xC2\xAE", 0xAE }, + { "\xC2\xAF", 0xAF }, { "\xC2\xB0", 0xB0 }, { "\xC2\xB1", 0xB1 }, { "\xC2\xB2", 0xB2 }, { "\xC2\xB3", 0xB3 }, { "\xC2\xB4", 0xB4 }, { "\xC2\xB5", 0xB5 }, { "\xC2\xB6", 0xB6 }, + { "\xC2\xB7", 0xB7 }, { "\xC2\xB8", 0xB8 }, { "\xC2\xB9", 0xB9 }, { "\xC2\xBA", 0xBA 
}, { "\xC2\xBB", 0xBB }, { "\xC2\xBC", 0xBC }, { "\xC2\xBD", 0xBD }, { "\xC2\xBE", 0xBE }, + { "\xC2\xBF", 0xBF }, { "\xC3\x80", 0xC0 }, { "\xC3\x81", 0xC1 }, { "\xC3\x82", 0xC2 }, { "\xC3\x83", 0xC3 }, { "\xC3\x84", 0xC4 }, { "\xC3\x85", 0xC5 }, { "\xC3\x86", 0xC6 }, + { "\xC3\x87", 0xC7 }, { "\xC3\x88", 0xC8 }, { "\xC3\x89", 0xC9 }, { "\xC3\x8A", 0xCA }, { "\xC3\x8B", 0xCB }, { "\xC3\x8C", 0xCC }, { "\xC3\x8D", 0xCD }, { "\xC3\x8E", 0xCE }, + { "\xC3\x8F", 0xCF }, { "\xC3\x90", 0xD0 }, { "\xC3\x91", 0xD1 }, { "\xC3\x92", 0xD2 }, { "\xC3\x93", 0xD3 }, { "\xC3\x94", 0xD4 }, { "\xC3\x95", 0xD5 }, { "\xC3\x96", 0xD6 }, + { "\xC3\x97", 0xD7 }, { "\xC3\x98", 0xD8 }, { "\xC3\x99", 0xD9 }, { "\xC3\x9A", 0xDA }, { "\xC3\x9B", 0xDB }, { "\xC3\x9C", 0xDC }, { "\xC3\x9D", 0xDD }, { "\xC3\x9E", 0xDE }, + { "\xC3\x9F", 0xDF }, { "\xC3\xA0", 0xE0 }, { "\xC3\xA1", 0xE1 }, { "\xC3\xA2", 0xE2 }, { "\xC3\xA3", 0xE3 }, { "\xC3\xA4", 0xE4 }, { "\xC3\xA5", 0xE5 }, { "\xC3\xA6", 0xE6 }, + { "\xC3\xA7", 0xE7 }, { "\xC3\xA8", 0xE8 }, { "\xC3\xA9", 0xE9 }, { "\xC3\xAA", 0xEA }, { "\xC3\xAB", 0xEB }, { "\xC3\xAC", 0xEC }, { "\xC3\xAD", 0xED }, { "\xC3\xAE", 0xEE }, + { "\xC3\xAF", 0xEF }, { "\xC3\xB0", 0xF0 }, { "\xC3\xB1", 0xF1 }, { "\xC3\xB2", 0xF2 }, { "\xC3\xB3", 0xF3 }, { "\xC3\xB4", 0xF4 }, { "\xC3\xB5", 0xF5 }, { "\xC3\xB6", 0xF6 }, + { "\xC3\xB7", 0xF7 }, { "\xC3\xB8", 0xF8 }, { "\xC3\xB9", 0xF9 }, { "\xC3\xBA", 0xFA }, { "\xC3\xBB", 0xFB }, { "\xC3\xBC", 0xFC }, { "\xC3\xBD", 0xFD }, { "\xC3\xBE", 0xFE }, + { "\xC3\xBF", 0xFF }, { "\xC4\x80", 0x00 }, { "\xC4\x81", 0x01 }, { "\xC4\x82", 0x02 }, { "\xC4\x83", 0x03 }, { "\xC4\x84", 0x04 }, { "\xC4\x85", 0x05 }, { "\xC4\x86", 0x06 }, + { "\xC4\x87", 0x07 }, { "\xC4\x88", 0x08 }, { "\xC4\x89", 0x09 }, { "\xC4\x8A", 0x0A }, { "\xC4\x8B", 0x0B }, { "\xC4\x8C", 0x0C }, { "\xC4\x8D", 0x0D }, { "\xC4\x8E", 0x0E }, + { "\xC4\x8F", 0x0F }, { "\xC4\x90", 0x10 }, { "\xC4\x91", 0x11 }, { "\xC4\x92", 0x12 }, { "\xC4\x93", 0x13 }, { "\xC4\x94", 0x14 }, { "\xC4\x95", 0x15 }, { "\xC4\x96", 0x16 }, + { "\xC4\x97", 0x17 }, { "\xC4\x98", 0x18 }, { "\xC4\x99", 0x19 }, { "\xC4\x9A", 0x1A }, { "\xC4\x9B", 0x1B }, { "\xC4\x9C", 0x1C }, { "\xC4\x9D", 0x1D }, { "\xC4\x9E", 0x1E }, + { "\xC4\x9F", 0x1F }, { "\xC4\xA0", 0x20 }, { "\xC4\xA1", 0x7F }, { "\xC4\xA2", 0x80 }, { "\xC4\xA3", 0x81 }, { "\xC4\xA4", 0x82 }, { "\xC4\xA5", 0x83 }, { "\xC4\xA6", 0x84 }, + { "\xC4\xA7", 0x85 }, { "\xC4\xA8", 0x86 }, { "\xC4\xA9", 0x87 }, { "\xC4\xAA", 0x88 }, { "\xC4\xAB", 0x89 }, { "\xC4\xAC", 0x8A }, { "\xC4\xAD", 0x8B }, { "\xC4\xAE", 0x8C }, + { "\xC4\xAF", 0x8D }, { "\xC4\xB0", 0x8E }, { "\xC4\xB1", 0x8F }, { "\xC4\xB2", 0x90 }, { "\xC4\xB3", 0x91 }, { "\xC4\xB4", 0x92 }, { "\xC4\xB5", 0x93 }, { "\xC4\xB6", 0x94 }, + { "\xC4\xB7", 0x95 }, { "\xC4\xB8", 0x96 }, { "\xC4\xB9", 0x97 }, { "\xC4\xBA", 0x98 }, { "\xC4\xBB", 0x99 }, { "\xC4\xBC", 0x9A }, { "\xC4\xBD", 0x9B }, { "\xC4\xBE", 0x9C }, + { "\xC4\xBF", 0x9D }, { "\xC5\x80", 0x9E }, { "\xC5\x81", 0x9F }, { "\xC5\x82", 0xA0 }, { "\xC5\x83", 0xAD } + }; + return hex_map; + } + + // len must be available + bool inline str_is_equal(const char* str1, const char* str2, size_t len) { + for (size_t i = 0; i < len; ++i) { + if (str1[i] != str2[i]) { + return false; + } + } + return true; + } + + std::vector bpe_gpt2_preprocess(const std::string& text) { + static std::unordered_map< unsigned char, std::string> byte_encoder = bytes_to_unicode(); + std::vector bpe_words; + std::vector bpe_encoded_words; + + std::string token=""; + const char *raw_text_p 
= text.c_str(); + // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ + bool collecting_numeric = false; + bool collecting_letter = false; + bool collecting_special = false; + bool collecting_whitespace_lookahead = false; + bool collecting=false; + + std::vector text_utf; + text_utf.reserve(text.size()); + bpe_words.reserve(text.size()); + bpe_encoded_words.reserve(text.size()); + + text_utf = CNCTUnicode::split_utf8_enhanced(text); + + for (int i = 0; i < (int)text_utf.size(); i++) { + const CNCTString &utf_char = text_utf[i]; + bool split_condition = false; + const char *text_pos = raw_text_p + utf_char.seq_offset_bytes; + int bytes_remain = strlen(text_pos); + // forward backward lookups + const CNCTString &utf_char_next = (i+1 < (int)text_utf.size()) ? text_utf[i+1] : CNCTString(); + const CNCTString &utf_char_next_next = (i+2 < (int)text_utf.size()) ? text_utf[i+2] : CNCTString(); + // const CNCTString &utf_char_prev = (i > 0) ? text_utf[i-1] : CNCTString(); + + // handling contractions + if (!split_condition && bytes_remain >= 2) { + // 's|'t|'m|'d + if (utf_char == '\'' && (utf_char_next == 's' || utf_char_next == 't' || utf_char_next == 'm' || utf_char_next == 'd')) { + split_condition = true; + } + if (split_condition) { + if (token.size()) { + bpe_words.emplace_back(token); // push previous content as token + } + token = utf_char.str + utf_char_next.str; + bpe_words.emplace_back(token); + token=""; + i++; + continue; + } + } + if (!split_condition && bytes_remain >= 3) { + // 're|'ve|'ll + if (utf_char == '\'' && ( + (utf_char_next == 'r' || utf_char_next_next == 'e') || + (utf_char_next == 'v' || utf_char_next_next == 'e') || + (utf_char_next == 'l' || utf_char_next_next == 'l')) + ) { + split_condition = true; + } + if (split_condition) { + // current token + next token can be defined + if (token.size()) { + bpe_words.emplace_back(token); // push previous content as token + } + token = utf_char.str + utf_char_next.str + utf_char_next_next.str; + bpe_words.emplace_back(token); // the contraction + token=""; + i+=2; + continue; + } + } + + if (!split_condition && !collecting) { + if (utf_char.char_type == CNCTCharType::LETTER || (!token.size() && utf_char==" " && utf_char_next.char_type == CNCTCharType::LETTER)) { + collecting_letter = true; + collecting = true; + } else if (utf_char.char_type == CNCTCharType::DIGIT || (!token.size() && utf_char==" " && utf_char_next.char_type == CNCTCharType::DIGIT)) { + collecting_numeric = true; + collecting = true; + } else if ( + ((utf_char.char_type != CNCTCharType::LETTER && utf_char.char_type != CNCTCharType::DIGIT) && (utf_char.char_type != CNCTCharType::WHITESPACE)) || + (!token.size() && utf_char==" " && utf_char_next.char_type != CNCTCharType::LETTER && utf_char_next.char_type != CNCTCharType::DIGIT && utf_char_next.char_type != CNCTCharType::WHITESPACE) + ) { + collecting_special = true; + collecting = true; + } else if (utf_char.char_type == CNCTCharType::WHITESPACE && utf_char_next.char_type == CNCTCharType::WHITESPACE) { + collecting_whitespace_lookahead = true; + collecting = true; + } else if (utf_char.char_type == CNCTCharType::WHITESPACE) { + split_condition = true; + } + } else if (!split_condition && collecting) { + if (collecting_letter && utf_char.char_type != CNCTCharType::LETTER) { + split_condition = true; + } else if (collecting_numeric && utf_char.char_type != CNCTCharType::DIGIT) { + split_condition = true; + } else if (collecting_special && (utf_char.char_type == 
CNCTCharType::LETTER || utf_char.char_type == CNCTCharType::DIGIT || utf_char.char_type == CNCTCharType::WHITESPACE)) { + split_condition = true; + } else if (collecting_whitespace_lookahead && utf_char_next.char_type != CNCTCharType::WHITESPACE) { + split_condition = true; + } + } + + if(utf_char_next.str.size() == 0) { + split_condition = true; // final + token += utf_char.str; + } + + if (split_condition) { + if (token.size()) { + bpe_words.emplace_back(token); + } + token = utf_char.str; + collecting = false; + collecting_letter = false; + collecting_numeric = false; + collecting_special = false; + collecting_whitespace_lookahead = false; + } else { + token += utf_char.str; + } + } + + for (std::string& word : bpe_words) { + std::string encoded_token=""; + for (char& c : word) { + encoded_token += byte_encoder[c]; + } + bpe_encoded_words.emplace_back(encoded_token); + } + + return bpe_encoded_words; + } + + // decoder (for one token) + std::string decode_token(const std::string& token) { + static std::unordered_map< std::string, unsigned char> byte_decoder = unicode_to_bytes(); + std::string decoded_token=""; + auto unicode_seqeunces = CNCTUnicode::split_utf8(token); + for (auto& unicode_sequence : unicode_seqeunces) { + decoded_token += byte_decoder[unicode_sequence]; + } + + return decoded_token; + } + + const gpt2bpe_vocab & vocab_; + std::vector symbols_; + std::vector symbols_final; + ggllm_bpe_bigram::queue work_queue_; + bool flag_g2ws=false; +}; + +static std::vector gpt2bpe_tokenize(const gpt2bpe_vocab & vocab, const std::string & text, bool bos, bool g2ws ) { + gpt2bpe_tokenizer tokenizer(vocab, g2ws); + std::vector output; + + if (text.empty()) { + return output; + } + + if (bos && vocab.special_bos_id != -1) { + output.push_back(vocab.special_bos_id); + } + + tokenizer.tokenize(text, output); + return output; +} + +#endif // CMPNCT_GPT2BPE diff --git a/examples/gptneox-wip/falcon-main.cpp b/examples/gptneox-wip/falcon-main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7f9a1620b60bf7e70136233c6befc8021116f2b9 --- /dev/null +++ b/examples/gptneox-wip/falcon-main.cpp @@ -0,0 +1,1111 @@ +#include "ggml.h" +#include "cmpnct_gpt2bpe.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +// default hparams +struct falcon_hparams { + size_t n_merges = 0; + size_t n_vocab = 0; + uint32_t n_ctx = 0; + uint32_t n_embd = 0; + uint32_t n_head = 0; + uint32_t n_head_kv = 1; // Needs to be 1 for 7B model + uint32_t n_ff = 0; + uint32_t n_block = 0; + float norm_eps = 1e-5; +}; +struct falcon_block { + // normalization + struct ggml_tensor* input_layernorm; + struct ggml_tensor* input_layernorm_b; + struct ggml_tensor* attention_norm; // Falcon-40B only + struct ggml_tensor* attention_norm_b; // Falcon-40B only + + // attention + struct ggml_tensor* query_key_value; + struct ggml_tensor* wo; + + // ff + struct ggml_tensor* ffn_up; + struct ggml_tensor* ffn_down; +}; + +struct falcon_model { + falcon_hparams hparams; + + struct ggml_tensor* tok_embeddings; + struct ggml_tensor* output_norm; + struct ggml_tensor* output_norm_b; + struct ggml_tensor* lm_head; + + std::vector blocks; + + // key + value memory + struct ggml_tensor* memory_k; + struct ggml_tensor* memory_v; + + struct gguf_context * ggufctx; + struct ggml_context * ctx; + struct ggml_context * kvctx; + + std::map tensors; +}; + +struct 
gpt_params { + int32_t seed = -1; // RNG seed + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + uint32_t n_predict = 200; // new tokens to predict + uint32_t n_batch = 512; // batch size for prompt processing + + // sampling parameters + int32_t top_k = 40; + float top_p = 1.0f; + float temp = 0.8f; + int32_t repeat_last_n = 64; + float repeat_penalty = 1.02f; + + std::string model = ""; // model path + std::string prompt = ""; + + std::string token_test = ""; + bool interactive = false; + int32_t interactive_port = -1; + int32_t n_gpu_layers = 0; +}; + +void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers); + fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); + fprintf(stderr, " prompt to start generation with (default: random)\n"); + fprintf(stderr, " -f FNAME, --file FNAME\n"); + fprintf(stderr, " load prompt from a file\n"); + fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n"); + fprintf(stderr, " test tokenization\n"); + fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); + fprintf(stderr, " --top_k N top-k sampling, 0 = n_vocab (default: %d)\n", params.top_k); + fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); + fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); + fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n); + fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty); + fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, "\n"); +} + +// Function to check if the next argument exists +std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { + if (i + 1 < argc && argv[i + 1][0] != '-') { + return argv[++i]; + } else { + fprintf(stderr, "error: %s requires one argument.\n", flag.c_str()); + gpt_print_usage(argc, argv, params); + exit(0); + } +} + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + if (arg == "-s" || arg == "--seed") { + params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-t" || arg == "--threads") { + params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { + params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-p" || arg == "--prompt") { + params.prompt = get_next_arg(i, argc, argv, arg, params); + } else if (arg == "-n" || arg == "--n_predict") { + params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--top_k") { + 
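+            // The sampling flags below (--top_k, --top_p, --temp, --repeat-*)
+            // are consumed by sample_top_k_top_p_repeat() further down; a
+            // hypothetical invocation might look like:
+            //   ./falcon-main -m falcon-7b.gguf -p "Once upon" -n 64 --top_k 40 --top_p 0.9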
params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--top_p") { + params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--temp") { + params.temp = std::stof(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--repeat-last-n") { + params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--repeat-penalty") { + params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-b" || arg == "--batch_size") { + params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-m" || arg == "--model") { + params.model = get_next_arg(i, argc, argv, arg, params); + } else if (arg == "-i" || arg == "--interactive") { + params.interactive = true; + } else if (arg == "-ip" || arg == "--interactive-port") { + params.interactive = true; + params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-h" || arg == "--help") { + gpt_print_usage(argc, argv, params); + exit(0); + } else if (arg == "-f" || arg == "--file") { + get_next_arg(i, argc, argv, arg, params); + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + break; + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } else if (arg == "-tt" || arg == "--token_test") { + params.token_test = get_next_arg(i, argc, argv, arg, params); + } + else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, params); + exit(0); + } + } + + return true; +} + +gpt2bpe_vocab::id sample_top_k_top_p_repeat( + const gpt2bpe_vocab & vocab, + const float * logits, + const int32_t * last_n_tokens_data, + size_t last_n_tokens_data_size, + int top_k, + double top_p, + double temp, + int repeat_last_n, + float repeat_penalty, + std::mt19937 & rng) { + + int n_logits = vocab.id_to_token.size(); + + const auto * plogits = logits; + + const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size); + + if (temp <= 0) { + // select the token with the highest logit directly + float max_logit = plogits[0]; + gpt2bpe_vocab::id max_id = 0; + + for (int i = 1; i < n_logits; ++i) { + if (plogits[i] > max_logit) { + max_logit = plogits[i]; + max_id = i; + } + } + return max_id; + } + + + std::vector> logits_id; + logits_id.reserve(n_logits); + + { + const float scale = 1.0f/temp; + for (int i = 0; i < n_logits; ++i) { + // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) + // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main + if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) { + // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability + if (plogits[i] < 0.0f) { + logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); + } else { + logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); + } + } else { + logits_id.push_back(std::make_pair(plogits[i]*scale, i)); + } + } + } + + // find the top K tokens + std::partial_sort( + logits_id.begin(), + logits_id.begin() + top_k, logits_id.end(), + [](const std::pair & a, const std::pair & b) { + return a.first > b.first; + }); + + logits_id.resize(top_k); + + double 
maxl = -INFINITY; + for (const auto & kv : logits_id) { + maxl = std::max(maxl, kv.first); + } + + // compute probs for the top K tokens + std::vector probs; + probs.reserve(logits_id.size()); + + double sum = 0.0; + for (const auto & kv : logits_id) { + double p = exp(kv.first - maxl); + probs.push_back(p); + sum += p; + } + + // normalize the probs + for (auto & p : probs) { + p /= sum; + } + + if (top_p < 1.0f) { + double cumsum = 0.0f; + for (int i = 0; i < top_k; i++) { + cumsum += probs[i]; + if (cumsum >= top_p) { + top_k = i + 1; + probs.resize(top_k); + logits_id.resize(top_k); + break; + } + } + + cumsum = 1.0/cumsum; + for (int i = 0; i < (int) probs.size(); i++) { + probs[i] *= cumsum; + } + } + +// printf("\n"); +// for (int i = 0; i < (int) probs.size(); i++) { +// for (int i = 0; i < 10; i++) { +// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); +// } + + std::discrete_distribution<> dist(probs.begin(), probs.end()); + int idx = dist(rng); + + return logits_id[idx].second; + +} + +struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name){ + + struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); + if( cur == NULL ) { + printf("%s: tensor '%s' not found!\n", __func__, name.c_str()); + } else { +// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); + } + + return cur; +} + +// load the model's weights from a file +bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_vocab & vocab) { + printf("%s: loading model from '%s'..\n", __func__, fname.c_str()); + + model.ctx = NULL; + + struct gguf_init_params ggufparams = { + /*.no_alloc = */ false, + /*.ctx = */ &model.ctx, + }; + + auto & ggufctx = model.ggufctx; + + ggufctx = gguf_init_from_file(fname.c_str(), ggufparams); + + if (!ggufctx) { + fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__); + return false; + } + + printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); + printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); + printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); + + // print all kv + #if 0 + { + const int n_kv = gguf_get_n_kv(ggufctx); + + printf("%s: n_kv: %d\n", __func__, n_kv); + + for (int i = 0; i < n_kv; ++i) { + const char * key = gguf_get_key(ggufctx, i); + + printf("%s: kv[%d]: key = %s\n", __func__, i, key); + } + } + #endif + + // print some standard metadata + { + int keyidx; + + keyidx = gguf_find_key(ggufctx, "general.name"); + if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "general.description"); + if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "general.author"); + if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "general.license"); + if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "general.architecture"); + if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "general.file_type"); + if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); + if 
(keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository"); + if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + } + + // check required metadata + { + int keyidx; + + // check model architecture kv + keyidx = gguf_find_key(ggufctx, "general.architecture"); + if (keyidx != -1) { + if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) { + printf("%s: model architecture not supported!\n", __func__); + return false; + } + } else { + printf("%s: gguf model architecture not found!\n", __func__); + return false; + } + + // check model tensor data layout kv + keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout"); + if (keyidx != -1) { + if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) { + printf("%s: model tensor data layout not supported!\n", __func__); + return false; + } + } else { + printf("%s: gguf model tensor data layout not found!\n", __func__); + return false; + } + + } + + // load hparams + { + auto & hparams = model.hparams; + + bool ok = true; + int keyidx; + + if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.context_length"); + if (keyidx != -1) { hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } + + if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.embedding_length"); + if (keyidx != -1) { hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } + + if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.attention.head_count"); + if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } + + if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.feed_forward_length"); + if (keyidx != -1) { hparams.n_ff = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } + + if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.block_count"); + if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } + + if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.attention.layer_norm_epsilon"); + if (keyidx != -1) { hparams.norm_eps= gguf_get_val_f32(ggufctx, keyidx); } else { ok = false; } } + + if (!ok) { + fprintf(stderr, "%s: required hparam missing!\n", __func__); + return false; + } + + keyidx = gguf_find_key(ggufctx, "falcon.attention.head_count_kv"); + if (keyidx != -1) { hparams.n_head_kv = gguf_get_val_u32(ggufctx, keyidx); } + + + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_head_kv = %d\n", __func__, hparams.n_head_kv); + printf("%s: n_block = %d\n", __func__, hparams.n_block); + printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps); + + } + + // load vocab + { + auto & hparams = model.hparams; + + int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model"); + + if (keyidx != -1) { + if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { + printf("%s: tokenizer model not supported!\n", __func__); + return false; + } + } else { + printf("%s: tokenizer model not found!\n", __func__); + return false; + } + + + int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); + + if (tokens_keyidx == -1) { + printf("%s: gpt2 tokenizer vocab not found!\n", __func__); + return false; + } + + int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges"); + + if (merges_keyidx == -1) { + printf("%s: gpt2 tokenizer merges not 
found!\n", __func__); + return false; + } + + hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); + hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); + + printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); + printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); + + for (size_t i = 0; i < hparams.n_vocab; i++) { + std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); + +// printf("token %d = '%s'\n",i,word.c_str() ); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + + if( vocab.id_to_token[i] == "\n" ) { + vocab.linefeed_id = i; + } + } + + std::vector> bpe_merges; + + for (size_t i = 0; i < hparams.n_merges; i++) { + + std::string word = gguf_get_arr_str(ggufctx, merges_keyidx, i); + + // Split the merges + std::string first, second; + size_t pos = word.find(' ', 1); // Start the search from the second character + if (pos != std::string::npos) { + first = word.substr(0, pos); + second = word.substr(pos + 1); + } + + bpe_merges.push_back(std::make_pair(first, second)); + } + + vocab.populate_bpe_ranks(bpe_merges); + + + keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) { vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } + keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) { vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } + keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) { vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } + keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } + keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } + + if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } + if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } + if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } + if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } + if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } + if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); } + + } + + + auto & ctx = model.ctx; + size_t ctx_size = ggml_get_mem_size(ctx); + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + + // print tensor info + #if 0 + { + const int n_tensors = gguf_get_n_tensors(ggufctx); + + printf("%s: n_tensors: %d\n", __func__, n_tensors); + + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name (ggufctx, i); + const size_t offset = gguf_get_tensor_offset(ggufctx, i); + + printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); + } + } + #endif + + // prepare memory for the weights + { + + auto & hparams = model.hparams; + + const int n_block = hparams.n_block; + + model.blocks.resize(n_block); + + model.tok_embeddings = 
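+    // note: the GGUF context was initialized with no_alloc = false (the same pattern as
+    // in the gptneox loader below), so the tensor data already lives in model.ctx and
+    // ggml_get_tensor() merely looks a tensor up by its GGUF name. A defensive caller
+    // could check for NULL, e.g. (sketch, not part of the original flow):
+    //   if (ggml_get_tensor(ctx, "token_embd.weight") == NULL) { return false; }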
ggml_get_tensor(ctx, "token_embd.weight"); + + model.output_norm = ggml_get_tensor(ctx, "output_norm.weight"); + model.output_norm_b = ggml_get_tensor(ctx, "output_norm.bias"); + model.lm_head = ggml_get_tensor(ctx, "output.weight"); + + // map by name + model.tensors["token_embd.weight"] = model.tok_embeddings; + model.tensors["output_norm.weight"] = model.output_norm; + model.tensors["output_norm.bias"] = model.output_norm_b; + model.tensors["output.weight"] = model.lm_head; + + for (int i = 0; i < n_block; ++i) { + + auto& block = model.blocks[i]; + std::string blocknamestart = "blk." + std::to_string(i) + "."; + + block.input_layernorm = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" ); + block.input_layernorm_b = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" ); + + if ( hparams.n_head_kv == 8 ) { // Falcon-40B + block.attention_norm = get_tensor_ex(ctx, blocknamestart + "attn_norm_2.weight" ); + block.attention_norm_b = get_tensor_ex(ctx, blocknamestart + "attn_norm_2.bias" ); + } + + // query_key_value shape for config.multi_query == True: + block.query_key_value = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" ); + block.wo = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" ); + + block.ffn_up = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" ); + block.ffn_down = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" ); + + // map by name + if ( hparams.n_head_kv == 8 ) { // Falcon-40B + // Falcon-40B: + model.tensors[blocknamestart + "attn_norm.weight"] = block.input_layernorm; + model.tensors[blocknamestart + "attn_norm.bias"] = block.input_layernorm_b; + model.tensors[blocknamestart + "attn_norm_2.weight"] = block.attention_norm; + model.tensors[blocknamestart + "attn_norm_2.bias"] = block.attention_norm_b; + } else { + // Falcon-7B: + model.tensors[blocknamestart + "attn_norm.weight"] = block.input_layernorm; + model.tensors[blocknamestart + "attn_norm.bias"] = block.input_layernorm_b; + } + + model.tensors[blocknamestart + "attn_qkv.weight"] = block.query_key_value; + model.tensors[blocknamestart + "attn_output.weight"] = block.wo; + + model.tensors[blocknamestart + "ffn_up.weight"] = block.ffn_up; + model.tensors[blocknamestart + "ffn_down.weight"] = block.ffn_down; + } + } + + // key + value memory + { + const auto & kvctx = model.kvctx; + const auto & hparams = model.hparams; + + const int n_block = hparams.n_block; + const int n_ctx = hparams.n_ctx; + const int n_embd = hparams.n_embd; + + const int64_t n_mem = n_block*n_ctx; + const int64_t n_elements = n_embd*n_mem; + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ size_t(n_elements*4+ggml_tensor_overhead()*2), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.kvctx = ggml_init(params); + if (!model.kvctx) { + fprintf(stderr, "%s: kv ggml_init() failed\n", __func__); + return false; + } + + } + + + model.memory_k = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + return true; +} + + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool falcon_eval( + 
const falcon_model & model,
+        const int n_threads,
+        const int n_past,
+        const std::vector<gpt2bpe_vocab::id> & embd_inp,
+              std::vector<float>             & embd_w,
+        size_t                               & mem_per_token) {
+
+
+    const int N = embd_inp.size();
+
+    const auto & hparams = model.hparams;
+
+    const int n_embd    = hparams.n_embd;
+    const int n_block   = hparams.n_block;
+    const int n_ctx     = hparams.n_ctx;
+    const int n_head    = hparams.n_head;
+    const int n_head_kv = hparams.n_head_kv;
+    const int n_vocab   = hparams.n_vocab;
+    const size_t head_dim = n_embd / n_head;
+
+    static size_t buf_size = 256u*1024*1024;
+    static void * buf = malloc(buf_size);
+
+    // use 2 scratch buffers
+    // TODO: very hacky solution - reimplement in a more elegant way
+    static size_t scr0_size = 256u*1024*1024;
+    static void * scr0 = malloc(scr0_size);
+
+    static size_t scr1_size = 256u*1024*1024;
+    static void * scr1 = malloc(scr1_size);
+
+    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
+        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+
+        // reallocate
+        buf_size = buf_size_new;
+        buf = realloc(buf, buf_size);
+        if (buf == nullptr) {
+            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
+            return false;
+        }
+    }
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_cgraph gf = {};
+//  gf.n_threads = n_threads;
+
+    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
+
+    // wte
+    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+//  struct ggml_tensor * repeat_dummy = ggml_new_tensor_3d(ctx0, inpL->type, head_dim, N + n_past, n_head);
+
+    ggml_type wtype = GGML_TYPE_F32;
+    const int sizeof_wtype = ggml_type_sizef(wtype);
+
+    for (int il = 0; il < n_block; ++il) {
+        struct ggml_tensor * cur;
+        struct ggml_tensor * layernorm_output;
+
+        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
+
+        // self-attention
+        {
+            layernorm_output = ggml_norm(ctx0, inpL);
+
+            layernorm_output = ggml_add(ctx0,
+                    ggml_mul(ctx0,
+                        ggml_repeat(ctx0, model.blocks[il].input_layernorm, layernorm_output),
+                        layernorm_output),
+                    ggml_repeat(ctx0, model.blocks[il].input_layernorm_b, layernorm_output));
+
+            if ( hparams.n_head_kv == 8 ) { // Falcon-40B
+                cur = ggml_norm(ctx0, inpL);
+
+                cur = ggml_add(ctx0,
+                        ggml_mul(ctx0,
+                            ggml_repeat(ctx0, model.blocks[il].attention_norm, cur),
+                            cur),
+                        ggml_repeat(ctx0, model.blocks[il].attention_norm_b, cur));
+            }
+            else { // Falcon 7B
+                cur = layernorm_output;
+            }
+
+            // compute QKV
+
+            cur = ggml_mul_mat(ctx0, model.blocks[il].query_key_value, cur);
+
+            // Note that the strides for Kcur, Vcur are set up so that the
+            // resulting views are misaligned with the tensor's storage
+            // (by applying the K/V offset we shift the tensor's original
+            // view to stick out behind the viewed QKV tensor's allocated
+            // memory, so to say). This is ok because no actual accesses
+            // happen to that out-of-range memory, but it can require some
+            // trickery when trying to accurately dump these views for
+            // debugging.
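+            //
+            // layout sketch of one fused QKV row, with illustrative Falcon-7B numbers
+            // (n_head = 71, n_head_kv = 1, head_dim = 64, i.e. (71 + 2*1)*64 = 4672
+            // values per token):
+            //
+            //   [ q_0 | q_1 | ... | q_70 | k_0 | v_0 ]
+            //
+            // which is where the byte offsets of the three views below come from:
+            //   Q starts at 0
+            //   K starts at head_dim *  n_head              * sizeof_wtype
+            //   V starts at head_dim * (n_head + n_head_kv) * sizeof_wtype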
+ + struct ggml_tensor * Qcur = ggml_view_3d( + ctx0, cur, head_dim, n_head, N, + head_dim * sizeof_wtype, + head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, + 0); + + struct ggml_tensor * Kcur = ggml_view_3d( + ctx0, cur, head_dim, n_head_kv, N, + head_dim * sizeof_wtype, + head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, + head_dim * n_head * sizeof_wtype); + + struct ggml_tensor * Vcur = ggml_view_3d( + ctx0, cur, head_dim, n_head_kv, N, + head_dim * sizeof_wtype, + head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype, + head_dim * (n_head + n_head_kv) * sizeof_wtype); + + // using mode = 2 for neox mode + Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, 0); + Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, 0); + + // store key and value to memory + { + struct ggml_tensor* k = ggml_view_1d( + ctx0, model.memory_k, N * n_head_kv * head_dim, + (ggml_element_size(model.memory_k) * n_head_kv * head_dim) * + (il * n_ctx + n_past)); + struct ggml_tensor* v = ggml_view_1d( + ctx0, model.memory_v, N * n_head_kv * head_dim, + (ggml_element_size(model.memory_v) * n_head_kv * head_dim) * + (il * n_ctx + n_past)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * K = ggml_permute( + ctx0, + ggml_reshape_3d( + ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_head_kv * head_dim, + il * n_ctx * + ggml_element_size(model.memory_k) * + n_head_kv * + head_dim), + head_dim, n_head_kv, n_past + N), + 0, 2, 1, 3); + + // K * Q + +// K = ggml_cont(ctx0, ggml_repeat2(ctx0, K, repeat_dummy)); + + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(head_dim))) + ); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + struct ggml_tensor* V = ggml_permute( + ctx0, + ggml_reshape_3d( + ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_head_kv * head_dim, + il * n_ctx * + ggml_element_size(model.memory_v) * + n_head_kv * + head_dim), + head_dim, n_head_kv, n_past + N), + 0, 2, 1, 3); + +// V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat2(ctx0, V, repeat_dummy))); + V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); + + // KQV = transpose(V) * KQ_soft_max + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection + { + cur = ggml_mul_mat(ctx0, + model.blocks[il].wo, + cur); + } + } + + ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + + struct ggml_tensor* inpFF = layernorm_output; + struct ggml_tensor* attn_out = ggml_cpy( + ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + { + cur = ggml_mul_mat(ctx0, model.blocks[il].ffn_up, inpFF); + cur = ggml_gelu(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.blocks[il].ffn_down, cur); + } + + cur = ggml_add(ctx0, cur, attn_out); + cur = ggml_add(ctx0, 
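+            // parallel residual: the MLP branch above was computed from the layernorm
+            // output, not from the attention result, and the final block output is the
+            // sum  inpL = inpL + attn_out + mlp(ln(inpL))  formed by these two adds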
cur, inpL); + // input for next layer + inpL = cur; + } + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // norm + { + inpL = ggml_norm(ctx0, inpL); + + // inpL = ln_f_g*inpL + ln_f_b + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.output_norm, inpL), + inpL), + ggml_repeat(ctx0, model.output_norm_b, inpL)); + } + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + // lm_head + { + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + + //inpL = ggml_add(ctx0, + // ggml_repeat(ctx0, model.lmh_b, inpL), + // inpL); + } + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); +// ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // return result for just the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + + ggml_free(ctx0); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + + int64_t t_load_us = 0; + + gpt2bpe_vocab vocab; + falcon_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!falcon_model_load(params.model, model, vocab)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + if (params.top_k == 0) { + params.top_k = model.hparams.n_vocab; + } + + printf("%s: seed = %d\n", __func__, params.seed); + printf("%s: temp = %.3f\n", __func__, params.temp); + printf("%s: top_k = %d\n", __func__, params.top_k); + printf("%s: top_p = %.3f\n", __func__, params.top_p); + printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); + printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); + + std::mt19937 rng(params.seed); + + if (params.prompt.empty()) { + params.prompt = "Once upon"; + } + + std::vector last_n_tokens(model.hparams.n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = gpt2bpe_tokenize(vocab, params.prompt,false, false); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); +// for (size_t i = 0; i < embd_inp.size(); i++) { +// printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token[embd_inp[i]].c_str()); +// } + + if( model.hparams.n_ctx < params.n_predict+embd_inp.size() ) { + params.n_predict = model.hparams.n_ctx-embd_inp.size(); + } + + printf("%s: n_predict = %d\n", __func__, params.n_predict); + printf("\n"); + + std::vector embd; + + // determine the required inference memory per token: + size_t mem_per_token = 0; + falcon_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 
0) {
+            const int64_t t_start_us = ggml_time_us();
+
+            if (!falcon_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
+                printf("Failed to predict\n");
+                return 1;
+            }
+
+            t_predict_us += ggml_time_us() - t_start_us;
+        }
+
+        n_past += embd.size();
+        embd.clear();
+
+        if (i >= embd_inp.size()) {
+            // sample next token
+            const int   top_k = params.top_k;
+            const float top_p = params.top_p;
+            const float temp  = params.temp;
+            const int   repeat_last_n  = params.repeat_last_n;
+            const float repeat_penalty = params.repeat_penalty;
+
+            const int n_vocab = model.hparams.n_vocab;
+
+            gpt2bpe_vocab::id id = 0;
+
+            {
+                const int64_t t_start_sample_us = ggml_time_us();
+
+                id = sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng);
+
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(id);
+
+                t_sample_us += ggml_time_us() - t_start_sample_us;
+            }
+
+            // add it to the context
+            embd.push_back(id);
+        } else {
+            // if here, it means we are still processing the input prompt
+            for (size_t k = i; k < embd_inp.size(); k++) {
+                embd.push_back(embd_inp[k]);
+                if (embd.size() > params.n_batch) {
+                    break;
+                }
+            }
+            i += embd.size() - 1;
+        }
+
+        // display text
+        for (auto id : embd) {
+            printf("%s", vocab.id_to_token[id].c_str() );
+        }
+        fflush(stdout);
+
+        // end of text token
+        if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) {
+            break;
+        }
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n\n");
+        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+    }
+
+    ggml_free(model.ctx);
+
+    return 0;
+}
diff --git a/examples/gptneox-wip/gptneox-main.cpp b/examples/gptneox-wip/gptneox-main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..55eba0cdcfdfb78885c7ef7852d8fdcb215d324c
--- /dev/null
+++ b/examples/gptneox-wip/gptneox-main.cpp
@@ -0,0 +1,1083 @@
+#include "ggml.h"
+#include "cmpnct_gpt2bpe.hpp"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <cinttypes>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include <thread>
+#include <random>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+// default hparams
+struct gpt_neox_hparams {
+    size_t n_merges = 0;
+    size_t n_vocab  = 0;
+    uint32_t n_ctx   = 0;
+    uint32_t n_embd  = 0;
+    uint32_t n_head  = 0;
+    uint32_t n_block = 0;
+    uint32_t n_rot   = 0; // rotary_pct * (n_embd / n_head)
+    bool par_res = true;
+    float norm_eps = 1e-5;
+};
+
+struct gpt_neox_block {
+    // pre normalization
+    struct ggml_tensor * ln_1_g;
+    struct ggml_tensor * ln_1_b;
+
+    // attention
+    struct ggml_tensor * c_attn_attn_w;
+    struct ggml_tensor * c_attn_attn_b;
+
+    struct ggml_tensor * c_attn_proj_w;
+    struct ggml_tensor * c_attn_proj_b;
+
+    // post normalization
+    struct ggml_tensor * ln_2_g;
+    struct ggml_tensor * ln_2_b;
+
+    // ff
+    struct ggml_tensor * c_mlp_fc_w;
+    struct ggml_tensor * c_mlp_fc_b;
+
+    struct ggml_tensor * c_mlp_proj_w;
+    struct ggml_tensor * c_mlp_proj_b;
+};
+
+struct gpt_neox_model {
+    gpt_neox_hparams hparams;
+
+    // normalization
+    struct ggml_tensor * ln_f_g;
+    struct ggml_tensor * ln_f_b;
+
+    struct ggml_tensor * wte; // token embeddings
+
+    struct ggml_tensor * lmh_g; // language model head
+
+    std::vector<gpt_neox_block> blocks;
+
+    // key + value memory
+    struct ggml_tensor * memory_k;
+    struct ggml_tensor * memory_v;
+
+    //
+    struct gguf_context * ggufctx;
+    struct ggml_context * ctx;
+    struct ggml_context * kvctx;
+
+    std::map<std::string, struct ggml_tensor *> tensors;
+};
+
+struct gpt_params {
+    int32_t seed      = -1; // RNG seed
+    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    uint32_t n_predict = 200; // new tokens to predict
+    uint32_t n_batch   = 512; // batch size for prompt processing
+
+    // sampling parameters
+    int32_t top_k          = 40;
+    float   top_p          = 1.0f;
+    float   temp           = 0.8f;
+    int32_t repeat_last_n  = 64;
+    float   repeat_penalty = 1.02f;
+
+    std::string model = ""; // model path
+    std::string prompt = "";
+
+    std::string token_test = "";
+    bool    interactive      = false;
+    int32_t interactive_port = -1;
+    int32_t n_gpu_layers     = 0;
+};
+
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help             show this help message and exit\n");
+    fprintf(stderr, "  -s SEED, --seed SEED   RNG seed (default: -1)\n");
+    fprintf(stderr, "  -t N, --threads N      number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
+    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
+    fprintf(stderr, "                         prompt to start generation with (default: random)\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stderr, "                         load prompt from a file\n");
+    fprintf(stderr, "  -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
+    fprintf(stderr, "                         test tokenization\n");
+    fprintf(stderr, "  -n N, --n_predict N    number of tokens to predict (default: %d)\n", params.n_predict);
+    fprintf(stderr, "  --top_k N              top-k sampling, 0 = n_vocab (default: %d)\n", params.top_k);
+    fprintf(stderr, "  --top_p N              top-p sampling (default: %.1f)\n", params.top_p);
+    fprintf(stderr, "  --temp N               temperature (default: %.1f)\n", params.temp);
+    fprintf(stderr, "  --repeat-last-n N      last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
+    fprintf(stderr, "  --repeat-penalty N     penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stderr, "  -b N, --batch_size N   batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                         model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "\n");
+}
+
+// Function to check if the next argument exists
+std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
+    if (i + 1 < argc && argv[i + 1][0] != '-') {
+        return argv[++i];
+    } else {
+        fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
+        gpt_print_usage(argc, argv, params);
+        exit(0);
+    }
+}
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-s" || arg == "--seed") {
+            params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
arg == "--gpu-layers" || arg == "--n-gpu-layers") { + params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-p" || arg == "--prompt") { + params.prompt = get_next_arg(i, argc, argv, arg, params); + } else if (arg == "-n" || arg == "--n_predict") { + params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--top_k") { + params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--top_p") { + params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--temp") { + params.temp = std::stof(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--repeat-last-n") { + params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--repeat-penalty") { + params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-b" || arg == "--batch_size") { + params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-m" || arg == "--model") { + params.model = get_next_arg(i, argc, argv, arg, params); + } else if (arg == "-i" || arg == "--interactive") { + params.interactive = true; + } else if (arg == "-ip" || arg == "--interactive-port") { + params.interactive = true; + params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-h" || arg == "--help") { + gpt_print_usage(argc, argv, params); + exit(0); + } else if (arg == "-f" || arg == "--file") { + get_next_arg(i, argc, argv, arg, params); + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + break; + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } else if (arg == "-tt" || arg == "--token_test") { + params.token_test = get_next_arg(i, argc, argv, arg, params); + } + else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, params); + exit(0); + } + } + + return true; +} + +gpt2bpe_vocab::id sample_top_k_top_p_repeat( + const gpt2bpe_vocab & vocab, + const float * logits, + const int32_t * last_n_tokens_data, + size_t last_n_tokens_data_size, + int top_k, + double top_p, + double temp, + int repeat_last_n, + float repeat_penalty, + std::mt19937 & rng) { + + int n_logits = vocab.id_to_token.size(); + + const auto * plogits = logits; + + const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size); + + if (temp <= 0) { + // select the token with the highest logit directly + float max_logit = plogits[0]; + gpt2bpe_vocab::id max_id = 0; + + for (int i = 1; i < n_logits; ++i) { + if (plogits[i] > max_logit) { + max_logit = plogits[i]; + max_id = i; + } + } + return max_id; + } + + + std::vector> logits_id; + logits_id.reserve(n_logits); + + { + const float scale = 1.0f/temp; + for (int i = 0; i < n_logits; ++i) { + // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) + // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main + if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) { + // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability + if (plogits[i] < 0.0f) { + logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); + } else { + 
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); + } + } else { + logits_id.push_back(std::make_pair(plogits[i]*scale, i)); + } + } + } + + // find the top K tokens + std::partial_sort( + logits_id.begin(), + logits_id.begin() + top_k, logits_id.end(), + [](const std::pair & a, const std::pair & b) { + return a.first > b.first; + }); + + logits_id.resize(top_k); + + double maxl = -INFINITY; + for (const auto & kv : logits_id) { + maxl = std::max(maxl, kv.first); + } + + // compute probs for the top K tokens + std::vector probs; + probs.reserve(logits_id.size()); + + double sum = 0.0; + for (const auto & kv : logits_id) { + double p = exp(kv.first - maxl); + probs.push_back(p); + sum += p; + } + + // normalize the probs + for (auto & p : probs) { + p /= sum; + } + + if (top_p < 1.0f) { + double cumsum = 0.0f; + for (int i = 0; i < top_k; i++) { + cumsum += probs[i]; + if (cumsum >= top_p) { + top_k = i + 1; + probs.resize(top_k); + logits_id.resize(top_k); + break; + } + } + + cumsum = 1.0/cumsum; + for (int i = 0; i < (int) probs.size(); i++) { + probs[i] *= cumsum; + } + } + +// printf("\n"); +// for (int i = 0; i < (int) probs.size(); i++) { +// for (int i = 0; i < 10; i++) { +// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); +// } + + std::discrete_distribution<> dist(probs.begin(), probs.end()); + int idx = dist(rng); + + return logits_id[idx].second; + +} + +struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name){ + + struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); + if( cur == NULL ) { + printf("%s: tensor '%s' not found!\n", __func__, name.c_str()); + } else { +// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); + } + + return cur; +} + +// load the model's weights from a file +bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2bpe_vocab & vocab) { + printf("%s: loading model from '%s'..\n", __func__, fname.c_str()); + + model.ctx = NULL; + + struct gguf_init_params ggufparams = { + /*.no_alloc = */ false, + /*.ctx = */ &model.ctx, + }; + + auto & ggufctx = model.ggufctx; + + ggufctx = gguf_init_from_file(fname.c_str(), ggufparams); + + if (!ggufctx) { + fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__); + return false; + } + + printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); + printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); + printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); + + // print all kv + #if 0 + { + const int n_kv = gguf_get_n_kv(ggufctx); + + printf("%s: n_kv: %d\n", __func__, n_kv); + + for (int i = 0; i < n_kv; ++i) { + const char * key = gguf_get_key(ggufctx, i); + + printf("%s: kv[%d]: key = %s\n", __func__, i, key); + } + } + #endif + + // print some standard metadata + { + int keyidx; + + keyidx = gguf_find_key(ggufctx, "general.name"); + if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "general.description"); + if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "general.author"); + if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "general.license"); + if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + 
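+    // the keys above are optional: when gguf_find_key() returns -1 the line is simply
+    // skipped; a new optional field would follow the same pattern, e.g. (hypothetical
+    // key, not necessarily present in these files):
+    //   keyidx = gguf_find_key(ggufctx, "general.url");
+    //   if (keyidx != -1) { printf("%s: model url = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }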
keyidx = gguf_find_key(ggufctx, "general.architecture"); + if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "general.file_type"); + if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); + if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository"); + if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + } + + // check required metadata + { + int keyidx; + + // check model architecture kv + keyidx = gguf_find_key(ggufctx, "general.architecture"); + if (keyidx != -1) { + if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) { + printf("%s: model architecture not supported!\n", __func__); + return false; + } + } else { + printf("%s: gguf model architecture not found!\n", __func__); + return false; + } + + } + + // load hparams + { + auto & hparams = model.hparams; + + bool ok = true; + int keyidx; + + if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.context_length"); + if (keyidx != -1) { hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } + + if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.embedding_length"); + if (keyidx != -1) { hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } + + if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.attention.head_count"); + if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } + + if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.block_count"); + if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } + + if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.rope.dimension_count"); + if (keyidx != -1) { hparams.n_rot = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } } + + if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.use_parallel_residual"); + if (keyidx != -1) { hparams.par_res = gguf_get_val_bool(ggufctx, keyidx); } else { ok = false; } } + + if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.attention.layer_norm_epsilon"); + if (keyidx != -1) { hparams.norm_eps= gguf_get_val_f32(ggufctx, keyidx); } else { ok = false; } } + + if (!ok) { + fprintf(stderr, "%s: required hparam missing!\n", __func__); + return false; + } + + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_block = %d\n", __func__, hparams.n_block); + printf("%s: n_rot = %d\n", __func__, hparams.n_rot); + printf("%s: par_res = %d\n", __func__, hparams.par_res); + printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps); + + } + + // load vocab + { + auto & hparams = model.hparams; + + int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model"); + + if (keyidx != -1) { + if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { + printf("%s: tokenizer model not supported!\n", __func__); + return false; + } + } else { + printf("%s: tokenizer model not found!\n", __func__); + return false; + } + + + int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); + + if (tokens_keyidx == -1) { + printf("%s: gpt2 tokenizer vocab not found!\n", __func__); + return false; + } + + int merges_keyidx = 
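+        // each BPE merge entry is one string holding the two merge partners separated by
+        // a single space (e.g. "h e" or "Ġ t" in GPT-2 style vocabularies); the split
+        // loop below searches for that space starting at the second character, so a
+        // leading space character cannot be mistaken for the separator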
gguf_find_key(ggufctx, "tokenizer.ggml.merges"); + + if (merges_keyidx == -1) { + printf("%s: gpt2 tokenizer merges not found!\n", __func__); + return false; + } + + hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); + hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); + + printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); + printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); + + for (size_t i = 0; i < hparams.n_vocab; i++) { + std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); + +// printf("token %d = '%s'\n",i,word.c_str() ); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + + if( vocab.id_to_token[i] == "\n" ) { + vocab.linefeed_id = i; + } + } + + std::vector> bpe_merges; + + for (size_t i = 0; i < hparams.n_merges; i++) { + + std::string word = gguf_get_arr_str(ggufctx, merges_keyidx, i); + + // Split the merges + std::string first, second; + size_t pos = word.find(' ', 1); // Start the search from the second character + if (pos != std::string::npos) { + first = word.substr(0, pos); + second = word.substr(pos + 1); + } + + bpe_merges.push_back(std::make_pair(first, second)); + } + + vocab.populate_bpe_ranks(bpe_merges); + + + keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) { vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } + keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) { vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } + keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) { vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } + keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } + keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } + + if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } + if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } + if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } + if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } + if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } + if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); } + } + + + auto & ctx = model.ctx; + size_t ctx_size = ggml_get_mem_size(ctx); + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + + // print tensor info + #if 0 + { + const int n_tensors = gguf_get_n_tensors(ggufctx); + + printf("%s: n_tensors: %d\n", __func__, n_tensors); + + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name (ggufctx, i); + const size_t offset = gguf_get_tensor_offset(ggufctx, i); + + printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); + } + } + #endif + + // prepare memory for the weights + { + const int n_block = 
model.hparams.n_block; + + model.blocks.resize(n_block); + + model.wte = ggml_get_tensor(ctx, "token_embd.weight"); + model.ln_f_g = ggml_get_tensor(ctx, "output_norm.weight"); + model.ln_f_b = ggml_get_tensor(ctx, "output_norm.bias"); + model.lmh_g = ggml_get_tensor(ctx, "output.weight"); + + // map by name + model.tensors["token_embd.weight"] = model.wte; + model.tensors["output_norm.weight"] = model.ln_f_g; + model.tensors["output_norm.bias"] = model.ln_f_b; + model.tensors["output.weight"] = model.lmh_g; + + for (int i = 0; i < n_block; ++i) { + auto & block = model.blocks[i]; + + std::string blocknamestart = "blk." + std::to_string(i) + "."; + + block.ln_1_g = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" ); + block.ln_1_b = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" ); + + block.c_attn_attn_w = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" ); + block.c_attn_attn_b = get_tensor_ex(ctx ,blocknamestart + "attn_qkv.bias" ); + + block.c_attn_proj_w = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" ); + block.c_attn_proj_b = get_tensor_ex(ctx, blocknamestart + "attn_output.bias" ); + + block.ln_2_g = get_tensor_ex(ctx, blocknamestart + "ffn_norm.weight" ); + block.ln_2_b = get_tensor_ex(ctx, blocknamestart + "ffn_norm.bias"); + + block.c_mlp_fc_w = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" ); + block.c_mlp_fc_b = get_tensor_ex(ctx, blocknamestart + "ffn_up.bias" ); + + block.c_mlp_proj_w = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" ); + block.c_mlp_proj_b = get_tensor_ex(ctx, blocknamestart + "ffn_down.bias" ); + + // map by name + model.tensors[blocknamestart + "attn_norm.weight"] = block.ln_1_g; + model.tensors[blocknamestart + "attn_norm.bias"] = block.ln_1_b; + + model.tensors[blocknamestart + "attn_qkv.weight"] = block.c_attn_attn_w; + model.tensors[blocknamestart + "attn_qkv.bias"] = block.c_attn_attn_b; + + model.tensors[blocknamestart + "attn_output.weight"] = block.c_attn_proj_w; + model.tensors[blocknamestart + "attn_output.bias"] = block.c_attn_proj_b; + + model.tensors[blocknamestart + "ffn_norm.weight"] = block.ln_2_g; + model.tensors[blocknamestart + "ffn_norm.bias"] = block.ln_2_b; + + model.tensors[blocknamestart + "ffn_up.weight"] = block.c_mlp_fc_w; + model.tensors[blocknamestart + "ffn_up.bias"] = block.c_mlp_fc_b; + + model.tensors[blocknamestart + "ffn_down.weight"] = block.c_mlp_proj_w; + model.tensors[blocknamestart + "ffn_down.bias"] = block.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & kvctx = model.kvctx; + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_block = hparams.n_block; + const int n_ctx = hparams.n_ctx; + + const int64_t n_mem = n_block*n_ctx; + const int64_t n_elements = n_embd*n_mem; + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ size_t(n_elements*4+ggml_tensor_overhead()*2), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + model.kvctx = ggml_init(params); + if (!model.kvctx) { + fprintf(stderr, "%s: kv ggml_init() failed\n", __func__); + return false; + } + + } + + + model.memory_k = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + return true; +} + + +// feed-forward network +ggml_tensor * 
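+// computes one block's feed-forward branch:
+//   ln_2(inp) -> fc (+ bias) -> GELU -> proj (+ bias)
+// the 1-D weight/bias tensors are broadcast over all N token columns via ggml_repeat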
gpt_neox_ff(
+        const gpt_neox_block &block,
+        ggml_context * ctx0,
+        ggml_tensor * inp,
+        const gpt_neox_hparams &hparams) {
+
+    ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps);
+
+    cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
+    cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
+    cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_fc_b, cur), cur);
+
+    // GELU activation
+    cur = ggml_gelu(ctx0, cur);
+
+    // projection
+    // cur = proj_w*cur + proj_b
+    cur = ggml_mul_mat(ctx0, block.c_mlp_proj_w, cur);
+
+    cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_proj_b, cur), cur);
+    return cur;
+}
+
+// evaluate the transformer
+//
+//   - model:     the model
+//   - n_threads: number of threads to use
+//   - n_past:    the context size so far
+//   - embd_inp:  the embeddings of the tokens in the context
+//   - embd_w:    the predicted logits for the next token
+//
+bool gpt_neox_eval(
+        const gpt_neox_model & model,
+        const int n_threads,
+        const int n_past,
+        const std::vector<gpt2bpe_vocab::id> & embd_inp,
+              std::vector<float>             & embd_w,
+        size_t                               & mem_per_token) {
+    const int N = embd_inp.size();
+
+    const auto & hparams = model.hparams;
+
+    const int n_embd  = hparams.n_embd;
+    const int n_block = hparams.n_block;
+    const int n_ctx   = hparams.n_ctx;
+    const int n_head  = hparams.n_head;
+    const int n_vocab = hparams.n_vocab;
+    const int n_rot   = hparams.n_rot;
+
+    static size_t buf_size = 256u*1024*1024;
+    static void * buf = malloc(buf_size);
+
+    // use 2 scratch buffers
+    // TODO: very hacky solution - reimplement in a more elegant way
+    static size_t scr0_size = 256u*1024*1024;
+    static void * scr0 = malloc(scr0_size);
+
+    static size_t scr1_size = 256u*1024*1024;
+    static void * scr1 = malloc(scr1_size);
+
+    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
+        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+
+        // reallocate
+        buf_size = buf_size_new;
+        buf = realloc(buf, buf_size);
+        if (buf == nullptr) {
+            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
+            return false;
+        }
+    }
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_cgraph gf = {};
+
+    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
+
+
+    // wte
+    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
+
+    for (int il = 0; il < n_block; ++il) {
+        struct ggml_tensor * cur;
+
+        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
+
+        // self-attention
+        {
+            {
+                cur = ggml_norm(ctx0, inpL, hparams.norm_eps);
+
+                cur = ggml_add(ctx0,
+                        ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
+                        ggml_repeat(ctx0, model.blocks[il].ln_1_b, cur));
+            }
+
+            // compute QKV
+            {
+
+                cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_attn_w, cur);
+                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_attn_b, cur), cur);
+            }
+
+            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
+            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head));
+            struct ggml_tensor * Vcur =
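+            // unlike Falcon's [all q | all k | all v] layout above, the NeoX fused QKV
+            // tensor interleaves per head: each head owns a contiguous [q_h | k_h | v_h]
+            // chunk of 3*(n_embd/n_head) values, which is why these three views use a
+            // row stride of cur->nb[1]/n_head and element offsets of 0, 1 and 2 times
+            // sizeof(float)*n_embd/n_head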
ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); + + // using mode = 2 for GPT-NeoX mode + Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0); + Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0); + + // store key and value to memory + { + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, + il*n_ctx*ggml_element_size(model.memory_v)*n_embd); + + // KQV = transpose(V) * KQ_soft_max + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection + { + cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_proj_w, cur); + cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_proj_b, cur), cur); + } + } + + ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + + if (hparams.par_res == 0) { + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); + + cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams); + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } else { + struct ggml_tensor * inpFF = cur; + + // this is independent of the self-attention result, so it could be done in parallel to the self-attention + // note here we pass inpL instead of cur + cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams); + + // layer input + FF + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + inpL = ggml_add(ctx0, cur, inpL); + } + } + + ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + + // norm + { + inpL = ggml_norm(ctx0, inpL, hparams.norm_eps); + + // inpL = ln_f_g*inpL + ln_f_b + inpL = ggml_add(ctx0, + 
ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + // lm_head + { + inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); + + //inpL = ggml_add(ctx0, + // ggml_repeat(ctx0, model.lmh_b, inpL), + // inpL); + } + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result for just the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + + ggml_free(ctx0); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + + int64_t t_load_us = 0; + + gpt2bpe_vocab vocab; + gpt_neox_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt_neox_model_load(params.model, model, vocab)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + if (params.top_k == 0) { + params.top_k = model.hparams.n_vocab; + } + + printf("%s: seed = %d\n", __func__, params.seed); + printf("%s: temp = %.3f\n", __func__, params.temp); + printf("%s: top_k = %d\n", __func__, params.top_k); + printf("%s: top_p = %.3f\n", __func__, params.top_p); + printf("%s: repeat_last_n = %d\n", __func__, params.repeat_last_n); + printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty); + + std::mt19937 rng(params.seed); + + if (params.prompt.empty()) { + params.prompt = "Once upon"; + } + + std::vector last_n_tokens(model.hparams.n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = gpt2bpe_tokenize(vocab, params.prompt,false, false); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); +// for (size_t i = 0; i < embd_inp.size(); i++) { +// printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token[embd_inp[i]].c_str()); +// } + + if( model.hparams.n_ctx < params.n_predict+embd_inp.size() ) { + params.n_predict = model.hparams.n_ctx-embd_inp.size(); + } + + printf("%s: n_predict = %d\n", __func__, params.n_predict); + printf("\n"); + + std::vector embd; + + // determine the required inference memory per token: + size_t mem_per_token = 0; + gpt_neox_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt_neox_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { + printf("Failed to 
predict\n"); + return 1; + } + + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + const int repeat_last_n = params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + + const int n_vocab = model.hparams.n_vocab; + + gpt2bpe_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng); + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (embd.size() > params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str() ); + } + fflush(stdout); + + // end of text token + if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx); + + return 0; +} diff --git a/examples/jeopardy/graph.py b/examples/jeopardy/graph.py index 1b6c54bff73d13096140d1de2cd46cdaefc071d5..8bc0706b86d05617e4733a8ed46b774dc186e602 100644 --- a/examples/jeopardy/graph.py +++ b/examples/jeopardy/graph.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import matplotlib.pyplot as plt import os import csv diff --git a/examples/json-schema-to-grammar.py b/examples/json-schema-to-grammar.py index 2dccc118a70e82d57d40e0ae27606699dc385884..2a4cb65bcfc7ef0782004ddccd317026d1e4f569 100644 --- a/examples/json-schema-to-grammar.py +++ b/examples/json-schema-to-grammar.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import json import re diff --git a/examples/llama-bench/CMakeLists.txt b/examples/llama-bench/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e395afd05f75589e60b08133ba25ebd75a5bcdf --- /dev/null +++ b/examples/llama-bench/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-bench) +add_executable(${TARGET} llama-bench.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp new file mode 100644 index 0000000000000000000000000000000000000000..34ddfde39d29575fa581575fafbf2a2bfd90c6f6 --- /dev/null +++ b/examples/llama-bench/llama-bench.cpp @@ -0,0 +1,1013 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ggml.h"
+#include "llama.h"
+#include "common.h"
+#include "build-info.h"
+#include "ggml-cuda.h"
+
+// utils
+static uint64_t get_time_ns() {
+    using clock = std::chrono::high_resolution_clock;
+    return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
+}
+
+template <typename T>
+static std::string join(const std::vector<T> & values, const std::string & delim) {
+    std::ostringstream str;
+    for (size_t i = 0; i < values.size(); i++) {
+        str << values[i];
+        if (i < values.size() - 1) {
+            str << delim;
+        }
+    }
+    return str.str();
+}
+
+template <typename T>
+static std::vector<T> split(const std::string & str, char delim) {
+    std::vector<T> values;
+    std::istringstream str_stream(str);
+    std::string token;
+    while (std::getline(str_stream, token, delim)) {
+        T value;
+        std::istringstream token_stream(token);
+        token_stream >> value;
+        values.push_back(value);
+    }
+    return values;
+}
+
+template <typename T>
+static T avg(const std::vector<T> & v) {
+    if (v.empty()) {
+        return 0;
+    }
+    T sum = std::accumulate(v.begin(), v.end(), T(0));
+    return sum / (T)v.size();
+}
+
+template <typename T>
+static T stdev(const std::vector<T> & v) {
+    if (v.size() <= 1) {
+        return 0;
+    }
+    T mean = avg(v);
+    T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
+    T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
+    return stdev;
+}
+
+static std::string get_cpu_info() {
+    std::string id;
+#ifdef __linux__
+    FILE * f = fopen("/proc/cpuinfo", "r");
+    if (f) {
+        char buf[1024];
+        while (fgets(buf, sizeof(buf), f)) {
+            if (strncmp(buf, "model name", 10) == 0) {
+                char * p = strchr(buf, ':');
+                if (p) {
+                    p++;
+                    while (std::isspace(*p)) {
+                        p++;
+                    }
+                    while (std::isspace(p[strlen(p) - 1])) {
+                        p[strlen(p) - 1] = '\0';
+                    }
+                    id = p;
+                    break;
+                }
+            }
+        }
+    }
+#endif
+    // TODO: other platforms
+    return id;
+}
+
+static std::string get_gpu_info() {
+    std::string id;
+#ifdef GGML_USE_CUBLAS
+    int count = ggml_cuda_get_device_count();
+    for (int i = 0; i < count; i++) {
+        char buf[128];
+        ggml_cuda_get_device_description(i, buf, sizeof(buf));
+        id += buf;
+        if (i < count - 1) {
+            id += "/";
+        }
+    }
+#endif
+    // TODO: other backends
+    return id;
+}
+
+// command line params
+enum output_formats {CSV, JSON, MARKDOWN, SQL};
+
+struct cmd_params {
+    std::vector<std::string> model;
+    std::vector<int> n_prompt;
+    std::vector<int> n_gen;
+    std::vector<int> n_batch;
+    std::vector<bool> f32_kv;
+    std::vector<int> n_threads;
+    std::vector<int> n_gpu_layers;
+    std::vector<int> main_gpu;
+    std::vector<bool> mul_mat_q;
+    std::vector<bool> low_vram;
+    std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
+    int reps;
+    bool verbose;
+    output_formats output_format;
+};
+
+static const cmd_params cmd_params_defaults = {
+    /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
+    /* n_prompt      */ {512},
+    /* n_gen         */ {128},
+    /* n_batch       */ {512},
+    /* f32_kv        */ {false},
+    /* n_threads     */ {get_num_physical_cores()},
+    /* n_gpu_layers  */ {99},
+    /* main_gpu      */ {0},
+    /* mul_mat_q     */ {true},
+    /* low_vram      */ {false},
+    /* tensor_split  */ {{}},
+    /* reps          */ 5,
+    /* verbose       */ false,
+    /* output_format */ MARKDOWN
+};
+
+static void print_usage(int /* argc */, char ** argv) {
+    printf("usage: %s [options]\n", argv[0]);
+    printf("\n");
+    printf("options:\n");
+    printf("  -h, --help\n");
+    printf("  -m, --model <filename>            (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    printf("  -p, --n-prompt <n>                (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+    printf("  -n, --n-gen <n>                   (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
",").c_str()); + printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str()); + printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -ngl N, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -mg i, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str()); + printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); + printf(" -ts, --tensor_split \n"); + printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); + printf(" -o, --output (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql"); + printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); + printf("\n"); + printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); + +} + +static cmd_params parse_cmd_params(int argc, char ** argv) { + cmd_params params; + std::string arg; + bool invalid_param = false; + const std::string arg_prefix = "--"; + const char split_delim = ','; + + params.verbose = cmd_params_defaults.verbose; + params.output_format = cmd_params_defaults.output_format; + params.reps = cmd_params_defaults.reps; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (arg == "-h" || arg == "--help") { + print_usage(argc, argv); + exit(0); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.model.insert(params.model.end(), p.begin(), p.end()); + } else if (arg == "-p" || arg == "--n-prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end()); + } else if (arg == "-n" || arg == "--n-gen") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); + } else if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); + } else if (arg == "--memory-f32") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end()); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); + } else if (arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); + } else if (arg == "-mg" || arg == "--main-gpu") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.main_gpu = split(argv[i], split_delim); + } else if 
(arg == "-lv" || arg == "--low-vram") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.low_vram.insert(params.low_vram.end(), p.begin(), p.end()); + } else if (arg == "-mmq" || arg == "--mul-mat-q") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end()); + } else if (arg == "-ts" || arg == "--tensor-split") { + if (++i >= argc) { + invalid_param = true; + break; + } + for (auto ts : split(argv[i], split_delim)) { + // split string by ; and / + const std::regex regex{R"([;/]+)"}; + std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; + std::vector split_arg{it, {}}; + GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + + std::array tensor_split; + for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { + if (i < split_arg.size()) { + tensor_split[i] = std::stof(split_arg[i]); + } else { + tensor_split[i] = 0.0f; + } + } + params.tensor_split.push_back(tensor_split); + } + } else if (arg == "-r" || arg == "--repetitions") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.reps = std::stoi(argv[i]); + } else if (arg == "-o" || arg == "--output") { + if (++i >= argc) { + invalid_param = true; + break; + } + if (argv[i] == std::string("csv")) { + params.output_format = CSV; + } else if (argv[i] == std::string("json")) { + params.output_format = JSON; + } else if (argv[i] == std::string("md")) { + params.output_format = MARKDOWN; + } else if (argv[i] == std::string("sql")) { + params.output_format = SQL; + } else { + invalid_param = true; + break; + } + } else if (arg == "-v" || arg == "--verbose") { + params.verbose = true; + } else { + invalid_param = true; + break; + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + print_usage(argc, argv); + exit(1); + } + + // set defaults + if (params.model.empty()) { params.model = cmd_params_defaults.model; } + if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } + if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } + if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } + if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; } + if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } + if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } + if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } + if (params.low_vram.empty()) { params.low_vram = cmd_params_defaults.low_vram; } + if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } + if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } + + return params; +} + +struct cmd_params_instance { + std::string model; + int n_prompt; + int n_gen; + int n_batch; + bool f32_kv; + int n_threads; + int n_gpu_layers; + int main_gpu; + bool mul_mat_q; + bool low_vram; + std::array tensor_split; + + llama_context_params to_llama_params() const { + llama_context_params lparams = llama_context_default_params(); + lparams.n_ctx = n_prompt + n_gen; + lparams.n_batch = n_batch; + lparams.f16_kv = !f32_kv; + lparams.n_gpu_layers = n_gpu_layers; + lparams.main_gpu = main_gpu; + lparams.mul_mat_q = mul_mat_q; + lparams.low_vram = low_vram; + lparams.tensor_split = tensor_split.data(); + + return lparams; + } +}; + +static 
+static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
+    std::vector<cmd_params_instance> instances;
+
+    for (const auto & m : params.model)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & nl : params.n_gpu_layers)
+    for (const auto & mg : params.main_gpu)
+    for (const auto & mmq : params.mul_mat_q)
+    for (const auto & lv : params.low_vram)
+    for (const auto & ts : params.tensor_split)
+    for (const auto & nt : params.n_threads) {
+        cmd_params_instance instance = {
+            /* .model        = */ m,
+            /* .n_prompt     = */ n_prompt,
+            /* .n_gen        = */ n_gen,
+            /* .n_batch      = */ nb,
+            /* .f32_kv       = */ fk,
+            /* .n_threads    = */ nt,
+            /* .n_gpu_layers = */ nl,
+            /* .main_gpu     = */ mg,
+            /* .mul_mat_q    = */ mmq,
+            /* .low_vram     = */ lv,
+            /* .tensor_split = */ ts,
+        };
+        instances.push_back(instance);
+    }
+    return instances;
+}
+
+static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
+    std::vector<cmd_params_instance> instances;
+
+    for (const auto & n_prompt : params.n_prompt) {
+        if (n_prompt == 0) {
+            continue;
+        }
+        auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt);
+        instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end());
+    }
+
+    for (const auto & n_gen : params.n_gen) {
+        if (n_gen == 0) {
+            continue;
+        }
+        auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
+        instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
+    }
+
+    return instances;
+}
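// The nested loops above form a cartesian product over all parameter lists.
// With the defaults (one value per parameter) this yields exactly two
// instances: pp 512 (n_prompt = 512, n_gen = 0) and tg 128 (n_prompt = 0,
// n_gen = 128). Adding "-b 256,512" would double that to four.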
"GPU BLAS"; + } + if (blas) { + return "BLAS"; + } + return "CPU"; + } + + static const std::vector & get_fields() { + static const std::vector fields = { + "build_commit", "build_number", + "cuda", "opencl", "metal", "gpu_blas", "blas", + "cpu_info", "gpu_info", + "model_filename", "model_type", "model_size", "model_n_params", + "n_batch", "n_threads", "f16_kv", + "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split", + "n_prompt", "n_gen", "test_time", + "avg_ns", "stddev_ns", + "avg_ts", "stddev_ts" + }; + return fields; + } + + enum field_type {STRING, BOOL, INT, FLOAT}; + + static field_type get_field_type(const std::string & field) { + if (field == "build_number" || field == "n_batch" || field == "n_threads" || + field == "model_size" || field == "model_n_params" || + field == "n_gpu_layers" || field == "main_gpu" || + field == "n_prompt" || field == "n_gen" || + field == "avg_ns" || field == "stddev_ns") { + return INT; + } + if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" || + field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") { + return BOOL; + } + if (field == "avg_ts" || field == "stddev_ts") { + return FLOAT; + } + return STRING; + } + + std::vector get_values() const { + std::string tensor_split_str; + int max_nonzero = 0; + for (int i = 0; i < LLAMA_MAX_DEVICES; i++) { + if (tensor_split[i] > 0) { + max_nonzero = i; + } + } + for (int i = 0; i <= max_nonzero; i++) { + char buf[32]; + snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]); + tensor_split_str += buf; + if (i < max_nonzero) { + tensor_split_str += "/"; + } + } + std::vector values = { + build_commit, std::to_string(build_number), + std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), + cpu_info, gpu_info, + model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), + std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv), + std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str, + std::to_string(n_prompt), std::to_string(n_gen), test_time, + std::to_string(avg_ns()), std::to_string(stdev_ns()), + std::to_string(avg_ts()), std::to_string(stdev_ts()) + }; + return values; + } + + std::map get_map() const { + std::map map; + auto fields = get_fields(); + auto values = get_values(); + std::transform(fields.begin(), fields.end(), values.begin(), + std::inserter(map, map.end()), std::make_pair); + return map; + } +}; + +const std::string test::build_commit = BUILD_COMMIT; +const int test::build_number = BUILD_NUMBER; +const bool test::cuda = !!ggml_cpu_has_cublas(); +const bool test::opencl = !!ggml_cpu_has_clblast(); +const bool test::metal = !!ggml_cpu_has_metal(); +const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); +const bool test::blas = !!ggml_cpu_has_blas(); +const std::string test::cpu_info = get_cpu_info(); +const std::string test::gpu_info = get_gpu_info(); + +struct printer { + virtual ~printer() {} + + FILE * fout; + virtual void print_header(const cmd_params & params) { (void) params; }; + virtual void print_test(const test & t) = 0; + virtual void print_footer() { }; +}; + +struct csv_printer : public printer { + static std::string escape_csv(const std::string & field) { + std::string escaped = "\""; + for (auto c : field) { + if (c == '"') { + escaped += "\""; + } + escaped += c; + } + escaped += "\""; + return escaped; + } + + void 
+struct csv_printer : public printer {
+    static std::string escape_csv(const std::string & field) {
+        std::string escaped = "\"";
+        for (auto c : field) {
+            if (c == '"') {
+                escaped += "\"";
+            }
+            escaped += c;
+        }
+        escaped += "\"";
+        return escaped;
+    }
+
+    void print_header(const cmd_params & params) override {
+        std::vector<std::string> fields = test::get_fields();
+        fprintf(fout, "%s\n", join(fields, ",").c_str());
+        (void) params;
+    }
+
+    void print_test(const test & t) override {
+        std::vector<std::string> values = t.get_values();
+        std::transform(values.begin(), values.end(), values.begin(), escape_csv);
+        fprintf(fout, "%s\n", join(values, ",").c_str());
+    }
+};
+
+struct json_printer : public printer {
+    bool first = true;
+
+    static std::string escape_json(const std::string & value) {
+        std::string escaped;
+        for (auto c : value) {
+            if (c == '"') {
+                escaped += "\\\"";
+            } else if (c == '\\') {
+                escaped += "\\\\";
+            } else if (c <= 0x1f) {
+                char buf[8];
+                snprintf(buf, sizeof(buf), "\\u%04x", c);
+                escaped += buf;
+            } else {
+                escaped += c;
+            }
+        }
+        return escaped;
+    }
+
+    static std::string format_value(const std::string & field, const std::string & value) {
+        switch (test::get_field_type(field)) {
+            case test::STRING:
+                return "\"" + escape_json(value) + "\"";
+            case test::BOOL:
+                return value == "0" ? "false" : "true";
+            default:
+                return value;
+        }
+    }
+
+    void print_header(const cmd_params & params) override {
+        fprintf(fout, "[\n");
+        (void) params;
+    }
+
+    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
+        assert(fields.size() == values.size());
+        for (size_t i = 0; i < fields.size(); i++) {
+            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
+        }
+    }
+
+    void print_test(const test & t) override {
+        if (first) {
+            first = false;
+        } else {
+            fprintf(fout, ",\n");
+        }
+        fprintf(fout, "  {\n");
+        print_fields(test::get_fields(), t.get_values());
+        fprintf(fout, "    \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
+        fprintf(fout, "    \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
+        fprintf(fout, "  }");
+        fflush(fout);
+    }
+
+    void print_footer() override {
+        fprintf(fout, "\n]\n");
+    }
+};
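// escape_json examples: 'he said "hi"' becomes 'he said \"hi\"', a backslash
// becomes "\\", and control characters are emitted as \u00XX (e.g. '\n' is
// written as "\u000a").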
+struct markdown_printer : public printer {
+    std::vector<std::string> fields;
+
+    static int get_field_width(const std::string & field) {
+        if (field == "model") {
+            return -30;
+        }
+        if (field == "t/s") {
+            return 16;
+        }
+        if (field == "size" || field == "params") {
+            return 10;
+        }
+        if (field == "n_gpu_layers") {
+            return 3;
+        }
+
+        int width = std::max((int)field.length(), 10);
+
+        if (test::get_field_type(field) == test::STRING) {
+            return -width;
+        }
+        return width;
+    }
+
+    static std::string get_field_display_name(const std::string & field) {
+        if (field == "n_gpu_layers") {
+            return "ngl";
+        }
+        if (field == "n_threads") {
+            return "threads";
+        }
+        if (field == "mul_mat_q") {
+            return "mmq";
+        }
+        if (field == "tensor_split") {
+            return "ts";
+        }
+        return field;
+    }
+
+    void print_header(const cmd_params & params) override {
+        // select fields to print
+        fields.push_back("model");
+        fields.push_back("size");
+        fields.push_back("params");
+        fields.push_back("backend");
+        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
+        if (!is_cpu_backend) {
+            fields.push_back("n_gpu_layers");
+        }
+        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
+            fields.push_back("n_threads");
+        }
+        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
+            fields.push_back("n_batch");
+        }
+        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
+            fields.push_back("f16_kv");
+        }
+        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
+            fields.push_back("main_gpu");
+        }
+        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
+            fields.push_back("mul_mat_q");
+        }
+        if (params.low_vram.size() > 1 || params.low_vram != cmd_params_defaults.low_vram) {
+            fields.push_back("low_vram");
+        }
+        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
+            fields.push_back("tensor_split");
+        }
+        fields.push_back("test");
+        fields.push_back("t/s");
+
+        fprintf(fout, "|");
+        for (const auto & field : fields) {
+            fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
+        }
+        fprintf(fout, "\n");
+        fprintf(fout, "|");
+        for (const auto & field : fields) {
+            int width = get_field_width(field);
+            fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-");
+        }
+        fprintf(fout, "\n");
+    }
+
+    void print_test(const test & t) override {
+        std::map<std::string, std::string> vmap = t.get_map();
+
+        fprintf(fout, "|");
+        for (const auto & field : fields) {
+            std::string value;
+            char buf[128];
+            if (field == "model") {
+                value = t.model_type;
+            } else if (field == "size") {
+                if (t.model_size < 1024*1024*1024) {
+                    snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
+                } else {
+                    snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
+                }
+                value = buf;
+            } else if (field == "params") {
+                if (t.model_n_params < 1000*1000*1000) {
+                    snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
+                } else {
+                    snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
+                }
+                value = buf;
+            } else if (field == "backend") {
+                value = test::get_backend();
+            } else if (field == "test") {
+                if (t.n_prompt > 0 && t.n_gen == 0) {
+                    snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
+                } else if (t.n_gen > 0 && t.n_prompt == 0) {
+                    snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
+                } else {
+                    assert(false);
+                    exit(1);
+                }
+                value = buf;
+            } else if (field == "t/s") {
+                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
+                value = buf;
+            } else if (vmap.find(field) != vmap.end()) {
+                value = vmap.at(field);
+            } else {
+                assert(false);
+                exit(1);
+            }
+
+            int width = get_field_width(field);
+            if (field == "t/s") {
+                // HACK: the utf-8 character is 2 bytes
+                width += 1;
+            }
+            fprintf(fout, " %*s |", width, value.c_str());
+        }
+        fprintf(fout, "\n");
+    }
+
+    void print_footer() override {
+        fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number);
+    }
+};
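// With a CPU-only build and default parameters the selected columns reduce to:
//   | model | size | params | backend | threads | test | t/s |
// where "test" renders as "pp 512" or "tg 128" and "t/s" as "mean ± stdev".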
"," : ""); + } + fprintf(fout, ");\n"); + fprintf(fout, "\n"); + (void) params; + } + + void print_test(const test & t) override { + fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str()); + fprintf(fout, "VALUES ("); + std::vector values = t.get_values(); + for (size_t i = 0; i < values.size(); i++) { + fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : ""); + } + fprintf(fout, ");\n"); + } +}; + +static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { + std::vector tokens(n_batch, llama_token_bos(ctx)); + int n_processed = 0; + while (n_processed < n_prompt) { + int n_tokens = std::min(n_prompt - n_processed, n_batch); + llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads); + n_processed += n_tokens; + } +} + +static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { + llama_token token = llama_token_bos(ctx); + for (int i = 0; i < n_gen; i++) { + llama_eval(ctx, &token, 1, n_past + i, n_threads); + } +} + +static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) { + (void) level; + (void) text; + (void) user_data; +} + +int main(int argc, char ** argv) { + // try to set locale for unicode characters in markdown + setlocale(LC_CTYPE, ".UTF-8"); + +#if !defined(NDEBUG) + fprintf(stderr, "warning: asserts enabled, performance may be affected\n"); +#endif + +#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__)) + fprintf(stderr, "warning: debug build, performance may be affected\n"); +#endif + +#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__) + fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n"); +#endif + + cmd_params params = parse_cmd_params(argc, argv); + + // initialize llama.cpp + if (!params.verbose) { + llama_log_set(llama_null_log_callback, NULL); + } + bool numa = false; + llama_backend_init(numa); + + // initialize printer + std::unique_ptr p; + switch (params.output_format) { + case CSV: + p.reset(new csv_printer()); + break; + case JSON: + p.reset(new json_printer()); + break; + case MARKDOWN: + p.reset(new markdown_printer()); + break; + case SQL: + p.reset(new sql_printer()); + break; + default: + assert(false); + exit(1); + } + p->fout = stdout; + p->print_header(params); + + std::vector params_instances = get_cmd_params_instances(params); + + for (const auto & inst : params_instances) { + // TODO: keep the model between tests when possible + llama_context_params lparams = inst.to_llama_params(); + + llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), lparams); + if (lmodel == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); + return 1; + } + + llama_context * ctx = llama_new_context_with_model(lmodel, lparams); + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); + llama_free_model(lmodel); + return 1; + } + + test t(inst, lmodel, ctx); + + // warmup run + if (t.n_prompt > 0) { + test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads); + } + if (t.n_gen > 0) { + test_gen(ctx, 1, 0, t.n_threads); + } + + for (int i = 0; i < params.reps; i++) { + uint64_t t_start = get_time_ns(); + if (t.n_prompt > 0) { + test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); + } + if (t.n_gen > 0) { + test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads); + } + uint64_t t_ns = 
+int main(int argc, char ** argv) {
+    // try to set locale for unicode characters in markdown
+    setlocale(LC_CTYPE, ".UTF-8");
+
+#if !defined(NDEBUG)
+    fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
+#endif
+
+#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__))
+    fprintf(stderr, "warning: debug build, performance may be affected\n");
+#endif
+
+#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
+    fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
+#endif
+
+    cmd_params params = parse_cmd_params(argc, argv);
+
+    // initialize llama.cpp
+    if (!params.verbose) {
+        llama_log_set(llama_null_log_callback, NULL);
+    }
+    bool numa = false;
+    llama_backend_init(numa);
+
+    // initialize printer
+    std::unique_ptr<printer> p;
+    switch (params.output_format) {
+        case CSV:
+            p.reset(new csv_printer());
+            break;
+        case JSON:
+            p.reset(new json_printer());
+            break;
+        case MARKDOWN:
+            p.reset(new markdown_printer());
+            break;
+        case SQL:
+            p.reset(new sql_printer());
+            break;
+        default:
+            assert(false);
+            exit(1);
+    }
+    p->fout = stdout;
+    p->print_header(params);
+
+    std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
+
+    for (const auto & inst : params_instances) {
+        // TODO: keep the model between tests when possible
+        llama_context_params lparams = inst.to_llama_params();
+
+        llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), lparams);
+        if (lmodel == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
+            return 1;
+        }
+
+        llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
+            llama_free_model(lmodel);
+            return 1;
+        }
+
+        test t(inst, lmodel, ctx);
+
+        // warmup run
+        if (t.n_prompt > 0) {
+            test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
+        }
+        if (t.n_gen > 0) {
+            test_gen(ctx, 1, 0, t.n_threads);
+        }
+
+        for (int i = 0; i < params.reps; i++) {
+            uint64_t t_start = get_time_ns();
+            if (t.n_prompt > 0) {
+                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            }
+            if (t.n_gen > 0) {
+                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
+            }
+            uint64_t t_ns = get_time_ns() - t_start;
+            t.samples_ns.push_back(t_ns);
+        }
+
+        p->print_test(t);
+
+        llama_print_timings(ctx);
+
+        llama_free(ctx);
+        llama_free_model(lmodel);
+    }
+
+    p->print_footer();
+
+    llama_backend_free();
+
+    return 0;
+}
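The benchmark's main loop follows a common measurement pattern: one warmup pass that is excluded from the samples, then `reps` timed repetitions whose nanosecond durations feed `avg`/`stdev`. A distilled, self-contained sketch of that pattern (the `run_once` workload is a placeholder, not part of the patch):

```cpp
#include <chrono>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

// Placeholder workload standing in for test_prompt()/test_gen().
static void run_once() { /* ... */ }

int main() {
    const int reps = 5;
    run_once(); // warmup, excluded from the samples

    std::vector<double> samples_ns;
    for (int i = 0; i < reps; i++) {
        auto t0 = std::chrono::high_resolution_clock::now();
        run_once();
        auto t1 = std::chrono::high_resolution_clock::now();
        samples_ns.push_back(std::chrono::duration<double, std::nano>(t1 - t0).count());
    }

    // sample mean and standard deviation, same identity as stdev() above
    double mean = std::accumulate(samples_ns.begin(), samples_ns.end(), 0.0) / reps;
    double sq   = std::inner_product(samples_ns.begin(), samples_ns.end(), samples_ns.begin(), 0.0);
    double sd   = std::sqrt((sq - reps * mean * mean) / (reps - 1));
    printf("%.0f ns ± %.0f ns\n", mean, sd);
    return 0;
}
```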
diff --git a/examples/llm.vim b/examples/llm.vim
index 594a285493dccb3f127f1e55fed230173d6f9a03..d580a3d00f9d664ab5a0fd431cfe3070b4a80732 100644
--- a/examples/llm.vim
+++ b/examples/llm.vim
@@ -8,7 +8,7 @@ function! Llm()
   let buffer_content = join(getline(1, '$'), "\n")
 
   " Create the JSON payload
-  let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false}
+  let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":256,"stop": ["\n\n\n"],"stream": v:false}
   let json_payload.prompt = buffer_content
 
   " Define the curl command
@@ -25,3 +25,4 @@ endfunction
 
 command! Llm call Llm()
+noremap <F2> :Llm<CR>
diff --git a/examples/main-cmake-pkg/.gitignore b/examples/main-cmake-pkg/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e32c11c7f4653cd6b1db1af13873d1ea50095544
--- /dev/null
+++ b/examples/main-cmake-pkg/.gitignore
@@ -0,0 +1,51 @@
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+*.gguf
+
+*.log
+.DS_Store
+.build/
+.cache/
+.direnv/
+.envrc
+.swiftpm
+.venv
+.clang-tidy
+.vs/
+.vscode/
+
+build*/
+out/
+tmp/
+
diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..473738719197d305e376f0129a225489068a364b
--- /dev/null
+++ b/examples/main-cmake-pkg/CMakeLists.txt
@@ -0,0 +1,36 @@
+cmake_minimum_required(VERSION 3.12)
+project("main-cmake-pkg" C CXX)
+set(TARGET main-cmake-pkg)
+
+find_package(Llama 0.0.1 REQUIRED)
+
+# Bake common functionality in with target. Because applications
+# using the relocatable Llama package should be outside of the
+# source tree, main-cmake-pkg pretends the dependencies are built-in.
+
+set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
+add_library(common OBJECT
+    ${_common_path}/common.h
+    ${_common_path}/common.cpp
+    ${_common_path}/console.h
+    ${_common_path}/console.cpp
+    ${_common_path}/grammar-parser.h
+    ${_common_path}/grammar-parser.cpp
+    )
+
+# WARNING: because build-info.h is auto-generated, it will only
+# be available after the user has built the llama.cpp sources.
+#
+configure_file(${_common_path}/../build-info.h
+    ${CMAKE_CURRENT_BINARY_DIR}/build-info.h
+    COPYONLY)
+
+target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
+    ${CMAKE_CURRENT_BINARY_DIR})
+
+add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
+target_include_directories(${TARGET} PRIVATE ${_common_path})
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6d665f28fe9bd051cebccb22a890471e480bceb7
--- /dev/null
+++ b/examples/main-cmake-pkg/README.md
@@ -0,0 +1,37 @@
+# llama.cpp/example/main-cmake-pkg
+
+This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
+
+## Building
+
+Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
+
+### Considerations
+
+When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
+
+### Build llama.cpp and install to C:\LlamaCPP directory
+
+In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
+
+```cmd
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+mkdir build
+cd build
+cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
+cmake --build . --config Release
+cmake --install . --prefix C:/LlamaCPP
+```
+
+### Build main-cmake-pkg
+
+```cmd
+cd ..\examples\main-cmake-pkg
+mkdir build
+cd build
+cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
+cmake --build . --config Release
+cmake --install . --prefix C:/MyLlamaApp
+```
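A consumer application built against the installed package could be as small as the following sketch; it uses only API calls that already appear elsewhere in this patch, while the model path is illustrative:

```cpp
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init(false /* numa */);

    llama_context_params lparams = llama_context_default_params();
    llama_model * model = llama_load_model_from_file("models/7B/ggml-model-q4_0.gguf", lparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, lparams);
    // ... tokenize a prompt, llama_eval(), sample tokens ...
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```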
diff --git a/examples/main/README.md b/examples/main/README.md
index 55c16096f03b793813816607c9f07dd275b2b953..26e1e28dd08c179a8726ee8eff8b8d61ac56af8d 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -34,7 +34,7 @@ For an interactive experience, try this command:
 #### Unix-based systems (Linux, macOS, etc.):
 
 ```bash
-./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
+./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
 'User: Hi
 AI: Hello. I am an AI chatbot. Would you like to talk?
 User: Sure!
@@ -45,7 +45,7 @@ User:'
 #### Windows:
 
 ```powershell
-main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
+main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
 ```
 
 The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -144,7 +144,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
 
 Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8, and it should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
 
-- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model. 
+- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
 
 ### Keep Prompt
 
@@ -160,9 +160,13 @@ The following options allow you to control the text generation process and fine-
 
 ### Number of Tokens to Predict
 
-- `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+- `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity, -2 = until context filled)
 
-The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.
+
+A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--n-keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in a significant pause in output.
+
+If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.
 
 It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
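For a concrete sense of the -1 behavior: with `--ctx-size 2048` and `--n-keep 128`, once 2048 tokens have been processed the first 128 tokens are kept, the most recent (2048 - 128)/2 = 960 tokens are re-evaluated, and generation resumes with roughly half of the window freed.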
@@ -270,7 +274,7 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 ### NUMA support
 
-- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
+- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
 
 ### Memory Float 32
 
@@ -284,6 +288,10 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 - `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
 
+### Grammars
+
+- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
+
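As an illustration of the idea (see the linked GBNF guide for the authoritative syntax), a minimal grammar along the lines of `root ::= "yes" | "no"` would constrain every response to a literal yes/no answer.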
For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 56ada7e69d99dd2c5656dfd3752078547875906f..d78112260de08ef5d5cd613114120b057c3bd5f0 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,9 +1,5 @@ -// Defines sigaction on msys: -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - #include "common.h" + #include "console.h" #include "llama.h" #include "build-info.h" @@ -17,6 +13,7 @@ #include #include #include +#include #include #include @@ -36,18 +33,69 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static llama_context ** g_ctx; +static llama_context ** g_ctx; +static llama_model ** g_model; +static gpt_params * g_params; +static std::vector * g_input_tokens; +static std::ostringstream * g_output_ss; +static std::vector * g_output_tokens; static bool is_interacting = false; + +static void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const std::vector & input_tokens, const std::string & output, + const std::vector & output_tokens +) { + if (params.logdir.empty()) { + return; + } + + const std::string timestamp = get_sortable_timestamp(); + + const bool success = create_directory_with_parents(params.logdir); + if (!success) { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE * logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: main\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Generation Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + dump_string_yaml_multiline(logfile, "output", output.c_str()); + dump_vector_int_yaml(logfile, "output_tokens", output_tokens); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); +} + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) -void sigint_handler(int signo) { +static void sigint_handler(int signo) { if (signo == SIGINT) { if (!is_interacting) { - is_interacting=true; + is_interacting = true; } else { console::cleanup(); printf("\n"); llama_print_timings(*g_ctx); + write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); } } @@ -56,11 +104,21 @@ void sigint_handler(int signo) { int main(int argc, char ** argv) { gpt_params params; + g_params = ¶ms; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { return 1; } +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("main", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + + // TODO: Dump params ? 
+ //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); + // save choice to use color for later // (note for later: this is a slightly awkward choice) console::init(params.simple_io, params.use_color); @@ -83,42 +141,38 @@ int main(int argc, char ** argv) { } if (params.rope_freq_base != 10000.0) { - fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); + LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 1.0) { - fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); + LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); } - if (params.n_ctx > 2048) { - // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048 - fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx); - } else if (params.n_ctx < 8) { - fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); - params.n_ctx = 8; - } - - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + LOG_TEE("%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.random_prompt) { params.prompt = gpt_random_prompt(rng); } + LOG("%s: llama backend init\n", __func__); llama_backend_init(params.numa); llama_model * model; llama_context * ctx; llama_context * ctx_guidance = NULL; + g_model = &model; g_ctx = &ctx; // load the model and apply lora adapter, if any + LOG("%s: load the model and apply lora adapter, if any\n", __func__); std::tie(model, ctx) = llama_init_from_gpt_params(params); if (params.cfg_scale > 1.f) { struct llama_context_params lparams = llama_context_params_from_gpt_params(params); @@ -126,34 +180,26 @@ int main(int argc, char ** argv) { } if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_TEE("%s: error: unable to load model\n", __func__); return 1; } + const int n_ctx_train = llama_n_ctx_train(ctx); + if (params.n_ctx > n_ctx_train) { + LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, params.n_ctx); + } else if (params.n_ctx < 8) { + LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + LOG_TEE("\n"); + LOG_TEE("system_info: n_threads = %d / %d | %s\n", params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } - // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters - // uncomment the "used_mem" line in llama.cpp to see the results - if (params.mem_test) { - { - fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx); - - const std::vector tmp(params.n_batch, llama_token_bos()); - llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads); - 
 
-    // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
-    // uncomment the "used_mem" line in llama.cpp to see the results
-    if (params.mem_test) {
-        {
-            fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
-
-            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
-            llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
-        }
-
-        llama_print_timings(ctx);
-        llama_free(ctx);
-        llama_free_model(model);
-
-        return 0;
-    }
-
     // export the cgraph and exit
     if (params.export_cgraph) {
         llama_eval_export(ctx, "llama.ggml");
@@ -167,7 +213,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> session_tokens;
 
     if (!path_session.empty()) {
-        fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
+        LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
 
         // fopen to check for existing session
         FILE * fp = std::fopen(path_session.c_str(), "rb");
@@ -177,53 +223,70 @@ int main(int argc, char ** argv) {
             session_tokens.resize(params.n_ctx);
             size_t n_token_count_out = 0;
             if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
-                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+                LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
                 return 1;
             }
             session_tokens.resize(n_token_count_out);
             llama_set_rng_seed(ctx, params.seed);
 
-            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
+            LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
         } else {
-            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
+            LOG_TEE("%s: session file does not exist, will create\n", __func__);
        }
     }
 
-    // tokenize the prompt
-    std::vector<llama_token> embd_inp;
+    const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    LOG("add_bos: %d\n", add_bos);
 
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
+    std::vector<llama_token> embd_inp;
 
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+        LOG("tokenize the prompt\n");
+        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
     } else {
+        LOG("use session tokens\n");
        embd_inp = session_tokens;
     }
 
+    LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
+    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+
+    // Should not run without any tokens
+    if (embd_inp.empty()) {
+        embd_inp.push_back(llama_token_bos(ctx));
+        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+    }
+
     // Tokenize negative prompt
     std::vector<llama_token> guidance_inp;
     int guidance_offset = 0;
     int original_prompt_len = 0;
     if (ctx_guidance) {
-        params.cfg_negative_prompt.insert(0, 1, ' ');
-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
+        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
+
+        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
+        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
+
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
 
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
         original_prompt_len = original_inp.size();
         guidance_offset = (int)guidance_inp.size() - original_prompt_len;
+        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
+        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
     }
 
     const int n_ctx = llama_n_ctx(ctx);
+    LOG("n_ctx: %d\n", n_ctx);
 
     if ((int) embd_inp.size() > n_ctx - 4) {
-        fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
     }
 
     // debug message about similarity of saved session, if applicable
     size_t n_matching_session_tokens = 0;
-    if (session_tokens.size()) {
+    if (!session_tokens.empty()) {
         for (llama_token id : session_tokens) {
             if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
                 break;
@@ -231,22 +294,27 @@ int main(int argc, char ** argv) {
             n_matching_session_tokens++;
         }
         if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
-            fprintf(stderr, "%s: using full prompt from session file\n", __func__);
+            LOG_TEE("%s: using full prompt from session file\n", __func__);
         } else if (n_matching_session_tokens >= embd_inp.size()) {
-            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
+            LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
         } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
-            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+            LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
                 __func__, n_matching_session_tokens, embd_inp.size());
         } else {
-            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
+            LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
                 __func__, n_matching_session_tokens, embd_inp.size());
         }
     }
 
+    LOGLN(
+        "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
+        log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
+
     // if we will use the cache for the full prompt without reaching the end of the cache, force
     // reevaluation of the last token to recalculate the cached logits
-    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
-            session_tokens.size() > embd_inp.size()) {
+    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
+        LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
+
         session_tokens.resize(embd_inp.size() - 1);
     }
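// Session prefix matching, by example: if the session holds tokens {1, 2, 3, 4}
// and the new prompt tokenizes to {1, 2, 9}, then n_matching_session_tokens is 2
// and only the suffix {9} needs to be re-evaluated.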
 
@@ -256,8 +324,11 @@ int main(int argc, char ** argv) {
     }
 
     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false);
+
+    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
+    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
 
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
@@ -270,34 +341,31 @@ int main(int argc, char ** argv) {
         params.interactive = true;
     }
 
-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
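// In instruct mode each user input is wrapped as:
//   "\n\n### Instruction:\n\n" + <input> + "\n\n### Response:\n\n"
// using the inp_pfx/inp_sfx token sequences tokenized above.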
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); + LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (ctx_guidance) { - fprintf(stderr, "\n"); - fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); - fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + LOG_TEE("\n"); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); for (int i = 0; i < (int) guidance_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i])); + LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); } } if (params.n_keep > 0) { - fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); + LOG_TEE("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i])); + LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - fprintf(stderr, "'\n"); + LOG_TEE("'\n"); } - fprintf(stderr, "\n"); + LOG_TEE("\n"); } if (params.interactive) { @@ -311,51 +379,51 @@ int main(int argc, char ** argv) { auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; }; - SetConsoleCtrlHandler(static_cast(console_ctrl_handler), true); + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - fprintf(stderr, "%s: interactive mode on.\n", __func__); + LOG_TEE("%s: interactive mode on.\n", __func__); - if (params.antiprompt.size()) { - for (auto antiprompt : params.antiprompt) { - fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); + if (!params.antiprompt.empty()) { + for (const auto & antiprompt : params.antiprompt) { + LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); } } if (params.input_prefix_bos) { - fprintf(stderr, "Input prefix with BOS\n"); + LOG_TEE("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); } if (!params.input_suffix.empty()) { - fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", + LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); - fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); - 
fprintf(stderr, "\n\n"); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_TEE("\n\n"); + struct llama_grammar * grammar = NULL; grammar_parser::parse_state parsed_grammar; - llama_grammar * grammar = NULL; + if (!params.grammar.empty()) { parsed_grammar = grammar_parser::parse(params.grammar.c_str()); // will be empty (default) if there are parse errors if (parsed_grammar.rules.empty()) { return 1; } - fprintf(stderr, "%s: grammar:\n", __func__); + LOG_TEE("%s: grammar:\n", __func__); grammar_parser::print_grammar(stderr, parsed_grammar); - fprintf(stderr, "\n"); + LOG_TEE("\n"); { - auto it = params.logit_bias.find(llama_token_eos()); + auto it = params.logit_bias.find(llama_token_eos(ctx)); if (it != params.logit_bias.end() && it->second == -INFINITY) { - fprintf(stderr, - "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); + LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); } } @@ -365,8 +433,8 @@ int main(int argc, char ** argv) { } // TODO: replace with ring-buffer - std::vector last_n_tokens(n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + std::vector last_tokens(n_ctx); + std::fill(last_tokens.begin(), last_tokens.end(), 0); if (params.interactive) { const char *control_message; @@ -378,11 +446,11 @@ int main(int argc, char ** argv) { " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - fprintf(stderr, "== Running in interactive mode. ==\n" + LOG_TEE("== Running in interactive mode. ==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - " - Press Ctrl+C to interject at any time.\n" + LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); #endif - "%s\n", control_message); + LOG_TEE( "%s\n", control_message); is_interacting = params.interactive_first; } @@ -397,33 +465,37 @@ int main(int argc, char ** argv) { int n_session_consumed = 0; int n_past_guidance = 0; + std::vector input_tokens; g_input_tokens = &input_tokens; + std::vector output_tokens; g_output_tokens = &output_tokens; + std::ostringstream output_ss; g_output_ss = &output_ss; + // the first thing we will do is to output the prompt, so set color accordingly console::set_display(console::prompt); std::vector embd; std::vector embd_guidance; - // do one empty run to warm up the model - { - const std::vector tmp = { llama_token_bos(), }; - llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); - llama_reset_timings(ctx); - } + const int n_vocab = llama_n_vocab(ctx); + + std::vector candidates; + candidates.reserve(n_vocab); while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict - if (embd.size() > 0) { + if (!embd.empty()) { // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via // --prompt or --file which uses the same value. - auto max_embd_size = n_ctx - 4; + int max_embd_size = n_ctx - 4; + // Ensure the input doesn't exceed the context size by truncating embd if necessary. - if ((int)embd.size() > max_embd_size) { - auto skipped_tokens = embd.size() - max_embd_size; + if ((int) embd.size() > max_embd_size) { + const int skipped_tokens = (int) embd.size() - max_embd_size; + embd.resize(max_embd_size); + console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); + printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); console::set_display(console::reset); fflush(stdout); - embd.resize(max_embd_size); } // infinite text generation via context swapping @@ -431,25 +503,27 @@ int main(int argc, char ** argv) { // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { + if (params.n_predict == -2) { + LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + const int n_left = n_past - params.n_keep; + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep); // always keep the first token - BOS - n_past = std::max(1, params.n_keep); + n_past = std::max(1, params.n_keep); n_past_guidance = std::max(1, params.n_keep + guidance_offset); - // insert n_left/2 tokens at the start of embd from last_n_tokens - embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); + LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); - // stop saving session if we run out of context - path_session.clear(); + // insert n_left/2 tokens at the start of embd from last_tokens + embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size()); - //printf("\n---\n"); - //printf("resetting: '"); - //for (int i = 0; i < (int) embd.size(); i++) { - // printf("%s", llama_token_to_str(ctx, embd[i])); - //} - //printf("'\n"); - //printf("\n---\n"); + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + + LOG("clear session path\n"); + path_session.clear(); } // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) @@ -479,7 +553,7 @@ int main(int argc, char ** argv) { if (ctx_guidance) { int input_size = 0; - llama_token* input_buf = NULL; + llama_token * input_buf = NULL; if (n_past_guidance < (int) guidance_inp.size()) { // Guidance context should have the same data with these modifications: @@ -495,22 +569,19 @@ int main(int argc, char ** argv) { ); } - input_buf = embd_guidance.data(); + input_buf = embd_guidance.data(); input_size = embd_guidance.size(); - //fprintf(stderr, "\n---------------------\n"); - //for (int i = 0; i < (int) embd_guidance.size(); i++) { - //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i])); - //} - //fprintf(stderr, "\n---------------------\n"); + + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance)); } else { - input_buf = embd.data(); + input_buf = embd.data(); input_size = embd.size(); } for (int i = 0; i < input_size; i += params.n_batch) { int n_eval = std::min(input_size - i, params.n_batch); if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_TEE("%s : failed to eval\n", __func__); return 1; } @@ -523,14 +594,20 @@ int main(int argc, char ** argv) { if (n_eval > params.n_batch) { n_eval = params.n_batch; } + + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); + if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_TEE("%s : failed to eval\n", __func__); return 1; } + n_past += n_eval; + + LOG("n_past = %d\n", n_past); } - if (embd.size() > 0 && 
!path_session.empty()) { + if (!embd.empty() && !path_session.empty()) { session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); n_session_consumed = session_tokens.size(); } @@ -540,101 +617,21 @@ int main(int argc, char ** argv) { embd_guidance.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - // out of user input, sample next token - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - // optionally save the session on first sample (for faster prompt loading next time) if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { need_to_save_session = false; llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - } - - llama_token id = 0; - - { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); - - // Apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - if (ctx_guidance) { - llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale); - } - // Apply penalties - float nl_logit = logits[llama_token_nl()]; - auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); - llama_sample_repetition_penalty(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); - if (!penalize_nl) { - logits[llama_token_nl()] = nl_logit; - } + LOG("saved session to %s\n", path_session.c_str()); + } - if (grammar != NULL) { - llama_sample_grammar(ctx, &candidates_p, grammar); - } + const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates); - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } else { - if (mirostat == 1) { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } else if (mirostat == 2) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - 
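The block being removed here is the default sampling path that this refactor consolidates into the shared llama_sample_token() helper: top-k, tail-free, locally typical, and top-p truncation, then a temperature step before the final draw. The temperature step divides every logit by temp prior to the softmax, so temp < 1 sharpens the distribution and temp > 1 flattens it. A self-contained illustration of that effect (plain C++, not the llama.cpp API):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Softmax with the logits divided by a temperature before normalization.
static std::vector<double> softmax_with_temp(std::vector<double> logits, double temp) {
    double max_logit = logits[0];
    for (double v : logits) max_logit = std::max(max_logit, v);
    double sum = 0.0;
    for (double & v : logits) { v = std::exp((v - max_logit) / temp); sum += v; }
    for (double & v : logits) v /= sum;
    return logits;
}

int main() {
    const std::vector<double> logits = {2.0, 1.0, 0.0};
    for (double temp : {0.5, 1.0, 2.0}) {
        const auto probs = softmax_with_temp(logits, temp);
        printf("temp = %.1f -> %.3f %.3f %.3f\n", temp, probs[0], probs[1], probs[2]);
    }
    return 0;
}
```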
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - // printf("`%d`", candidates_p.size); + last_tokens.erase(last_tokens.begin()); + last_tokens.push_back(id); - if (grammar != NULL) { - llama_grammar_accept_token(ctx, grammar, id); - } + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens)); - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - } - - // add it to the context embd.push_back(id); // echo this to console @@ -642,12 +639,15 @@ int main(int argc, char ** argv) { // decrement remaining sampling budget --n_remain; + + LOG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing + LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[n_consumed]); + last_tokens.erase(last_tokens.begin()); + last_tokens.push_back(embd_inp[n_consumed]); ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; @@ -658,23 +658,30 @@ int main(int argc, char ** argv) { // display text if (input_echo) { for (auto id : embd) { - printf("%s", llama_token_to_str(ctx, id)); + const std::string token_str = llama_token_to_piece(ctx, id); + printf("%s", token_str.c_str()); + + if (embd.size() > 1) { + input_tokens.push_back(id); + } else { + output_tokens.push_back(id); + output_ss << token_str; + } } fflush(stdout); } // reset color to default if we there is no pending user input - if (input_echo && (int)embd_inp.size() == n_consumed) { + if (input_echo && (int) embd_inp.size() == n_consumed) { console::set_display(console::reset); } // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { - // check for reverse prompt - if (params.antiprompt.size()) { + if (!params.antiprompt.empty()) { std::string last_output; - for (auto id : last_n_tokens) { - last_output += llama_token_to_str(ctx, id); + for (auto id : last_tokens) { + last_output += llama_token_to_piece(ctx, id); } is_antiprompt = false; @@ -687,7 +694,7 @@ int main(int argc, char ** argv) { ? 
last_output.length() - static_cast(antiprompt.length() + extra_padding) : 0; - if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) { + if (last_output.find(antiprompt, search_start_pos) != std::string::npos) { if (params.interactive) { is_interacting = true; console::set_display(console::user_input); @@ -697,12 +704,18 @@ int main(int argc, char ** argv) { break; } } + + if (is_antiprompt) { + LOG("found antiprompt: %s\n", last_output.c_str()); + } } // deal with end of text token in interactive mode - if (last_n_tokens.back() == llama_token_eos()) { + if (last_tokens.back() == llama_token_eos(ctx)) { + LOG("found EOS token\n"); + if (params.interactive) { - if (params.antiprompt.size() != 0) { + if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); @@ -719,16 +732,20 @@ int main(int argc, char ** argv) { } if (n_past > 0 && is_interacting) { + LOG("waiting for user input\n"); + if (params.instruct) { printf("\n> "); } if (params.input_prefix_bos) { - embd_inp.push_back(llama_token_bos()); + LOG("adding input prefix BOS token\n"); + embd_inp.push_back(llama_token_bos(ctx)); } std::string buffer; if (!params.input_prefix.empty()) { + LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); buffer += params.input_prefix; printf("%s", buffer.c_str()); } @@ -748,25 +765,43 @@ int main(int argc, char ** argv) { if (buffer.length() > 1) { // append input suffix if any if (!params.input_suffix.empty()) { + LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); buffer += params.input_suffix; printf("%s", params.input_suffix.c_str()); } + LOG("buffer: '%s'\n", buffer.c_str()); + + const size_t original_size = embd_inp.size(); + // instruct mode: insert instruction prefix if (params.instruct && !is_antiprompt) { + LOG("inserting instruction prefix\n"); n_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); } - auto line_inp = ::llama_tokenize(ctx, buffer, false); + const auto line_inp = ::llama_tokenize(ctx, buffer, false); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp)); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); // instruct mode: insert response suffix if (params.instruct) { + LOG("inserting instruction suffix\n"); embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); } + for (size_t i = original_size; i < embd_inp.size(); ++i) { + const llama_token token = embd_inp[i]; + output_tokens.push_back(token); + output_ss << llama_token_to_piece(ctx, token); + } + n_remain -= line_inp.size(); + LOG("n_remain: %d\n", n_remain); + } else { + LOG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -778,8 +813,7 @@ int main(int argc, char ** argv) { if (grammar != NULL) { llama_grammar_free(grammar); - std::vector grammar_rules( - parsed_grammar.c_rules()); + std::vector grammar_rules(parsed_grammar.c_rules()); grammar = llama_grammar_init( grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); @@ -790,24 +824,27 @@ int main(int argc, char ** argv) { } // end of text token - if (!embd.empty() && embd.back() == llama_token_eos() && !(params.instruct || params.interactive)) { - fprintf(stderr, " [end of text]\n"); + if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || 
params.interactive)) { + LOG_TEE(" [end of text]\n"); break; } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. - if (params.interactive && n_remain <= 0 && params.n_predict != -1) { + // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). + if (params.interactive && n_remain <= 0 && params.n_predict >= 0) { n_remain = params.n_predict; is_interacting = true; } } if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { - fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } llama_print_timings(ctx); + write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + if (ctx_guidance) { llama_free(ctx_guidance); } llama_free(ctx); llama_free_model(model); @@ -817,5 +854,9 @@ int main(int argc, char ** argv) { } llama_backend_free(); +#ifndef LOG_DISABLE_LOGS + LOG_TEE("Log end\n") +#endif // LOG_DISABLE_LOGS + return 0; } diff --git a/examples/make-ggml.py b/examples/make-ggml.py index f63d9fc22fb3fe41dff6f16ee85fec3a4ae91318..6a34eeac53faaa149d288582435d3b82d3df7280 100644 --- a/examples/make-ggml.py +++ b/examples/make-ggml.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ This script converts Hugging Face llama models to GGML and quantizes them. diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp index 7438defdefcdfeada492d9ae758695c914ea9b6b..c05a4fa933d316733a1c76e7e53fbefe66e5d172 100644 --- a/examples/metal/metal.cpp +++ b/examples/metal/metal.cpp @@ -2,7 +2,7 @@ // // - First, export a LLaMA graph: // -// $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export +// $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export // // - Run this tool to evaluate the exported graph: // diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 62433e983df9c4cb770ff36516685a9a3d55610a..2b375e34e7234685f70240a4c889dfc9dcf6cad7 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,16 +1,83 @@ +#include "build-info.h" #include "common.h" #include "llama.h" -#include "build-info.h" #include +#include +#include #include #include +#include +#include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif -std::vector softmax(const std::vector& logits) { +struct results_perplexity { + std::vector tokens; + double ppl_value; + std::vector logits; + std::vector probs; +}; + +struct results_log_softmax { + double log_softmax; + float logit; + float prob; +}; + +static void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const struct results_perplexity & results +) { + if (params.logdir.empty()) { + return; + } + + if (params.hellaswag) { + fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. 
No files will be written.\n", __func__); + return; + } + + const std::string timestamp = get_sortable_timestamp(); + + const bool success = create_directory_with_parents(params.logdir); + if (!success) { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE * logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: main\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Perplexity Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + dump_vector_float_yaml(logfile, "logits", results.logits); + fprintf(logfile, "ppl_value: %f\n", results.ppl_value); + dump_vector_float_yaml(logfile, "probs", results.probs); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); +} + +static std::vector softmax(const std::vector& logits) { std::vector probs(logits.size()); float max_logit = logits[0]; for (float v : logits) max_logit = std::max(max_logit, v); @@ -26,12 +93,208 @@ std::vector softmax(const std::vector& logits) { return probs; } -void perplexity(llama_context * ctx, const gpt_params & params) { +static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { + float max_logit = logits[0]; + for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]); + double sum_exp = 0.0; + for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit); + return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; +} + +static void process_logits( + int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, + double & nll, double & nll2, float * logit_history, float * prob_history +) { + std::mutex mutex; + int counter = 0; + auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { + double local_nll = 0, local_nll2 = 0; + while (true) { + std::unique_lock lock(mutex); + int i = counter++; + if (i >= n_token) { + nll += local_nll; nll2 += local_nll2; + break; + } + lock.unlock(); + const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); + const double v = -results.log_softmax; + local_nll += v; + local_nll2 += v*v; + + logit_history[i] = results.logit; + prob_history[i] = results.prob; + } + }; + for (auto & w : workers) w = std::thread(compute); + compute(); + for (auto & w : workers) w.join(); + +} + +static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { + // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research + // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` + // Output: `perplexity: 13.5106 [114/114]` + // BOS tokens will be added for each chunk before eval + + const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + const bool add_bos = is_spm; + + fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + + std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + + if 
(int(tokens.size()) < 2*params.n_ctx) { + fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx, + params.n_ctx); + fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + return {std::move(tokens), 0., {}, {}}; + } + + std::vector logit_history; + std::vector prob_history; + + logit_history.resize(tokens.size()); + prob_history.resize(tokens.size()); + + if (params.ppl_stride <= 0) { + fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); + return {tokens, -1, logit_history, prob_history}; + } + + const int calc_chunk = params.n_ctx; + + fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); + + if (int(tokens.size()) <= calc_chunk) { + fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, + tokens.size(), params.n_ctx, params.ppl_stride); + return {tokens, -1, logit_history, prob_history}; + } + + const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride; + + const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); + const int n_vocab = llama_n_vocab(ctx); + const int n_batch = params.n_batch; + + int count = 0; + double nll = 0.0; + + fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); + + for (int i = 0; i < n_chunk; ++i) { + const int start = i * params.ppl_stride; + const int end = start + calc_chunk; + + const int num_batches = (calc_chunk + n_batch - 1) / n_batch; + //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches); + + std::vector logits; + + const auto t_start = std::chrono::high_resolution_clock::now(); + + for (int j = 0; j < num_batches; ++j) { + const int batch_start = start + j * n_batch; + const int batch_size = std::min(end - batch_start, n_batch); + + //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); + if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { + //fprintf(stderr, "%s : failed to eval\n", __func__); + return {tokens, -1, logit_history, prob_history}; + } + + // save original token and restore it after eval + const auto token_org = tokens[batch_start]; + + // add BOS token for the first batch of each chunk + if (add_bos && j == 0) { + tokens[batch_start] = llama_token_bos(ctx); + } + + const auto batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + + if (j == 0) { + tokens[batch_start] = token_org; + } + } + + const auto t_end = std::chrono::high_resolution_clock::now(); + + if (i == 0) { + const float t_total = std::chrono::duration(t_end - t_start).count(); + fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + int total_seconds = (int)(t_total * n_chunk); + if (total_seconds >= 60*60) { + fprintf(stderr, "%d hours ", total_seconds / (60*60)); + total_seconds = total_seconds % (60*60); + } + fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + } + + //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); + for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) { + + // Calculate probability of next token, given the previous ones. 
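Each iteration of this scoring loop contributes the negative log-probability of the observed next token, and the reported perplexity is exp of the running mean of those contributions (the `std::exp(nll / count)` printed below). A minimal self-contained helper showing just that arithmetic:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Perplexity = exp of the mean negative log-likelihood of the scored tokens.
static double perplexity_from_probs(const std::vector<double> & probs) {
    double nll = 0.0;
    for (const double p : probs) {
        nll += -std::log(p);
    }
    return std::exp(nll / probs.size());
}

int main() {
    // Two scored positions with p = 0.5 and p = 0.25:
    // mean NLL = (log 2 + log 4) / 2, so PPL = sqrt(8) ≈ 2.828.
    printf("PPL = %.3f\n", perplexity_from_probs({0.5, 0.25}));
    return 0;
}
```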
+ const std::vector tok_logits( + logits.begin() + (j + 0) * n_vocab, + logits.begin() + (j + 1) * n_vocab); + + const float prob = softmax(tok_logits)[tokens[start + j + 1]]; + logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]]; + prob_history[start + j + 1] = prob; + + nll += -std::log(prob); + ++count; + } + // perplexity is e^(average negative log-likelihood) + if (params.ppl_output_type == 0) { + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + } else { + printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); + } + fflush(stdout); + } + printf("\n"); + + return {tokens, std::exp(nll / count), logit_history, prob_history}; +} + +static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) { + if (params.ppl_stride > 0) { + return perplexity_v2(ctx, params); + } + // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each chunk before eval - auto tokens = ::llama_tokenize(ctx, params.prompt, true); + + const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + const bool add_bos = is_spm; + + auto tim1 = std::chrono::high_resolution_clock::now(); + fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + + std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + + auto tim2 = std::chrono::high_resolution_clock::now(); + fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + + if (int(tokens.size()) < 2*params.n_ctx) { + fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx, + params.n_ctx); + fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + return {std::move(tokens), 0., {}, {}}; + } + + std::vector logit_history; + logit_history.resize(tokens.size()); + + std::vector prob_history; + prob_history.resize(tokens.size()); const int n_chunk_max = tokens.size() / params.n_ctx; @@ -41,9 +304,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) { int count = 0; double nll = 0.0; + double nll2 = 0.0; fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); + std::vector workers(std::thread::hardware_concurrency() - 1); + for (int i = 0; i < n_chunk; ++i) { const int start = i * params.n_ctx; const int end = start + params.n_ctx; @@ -62,13 +328,13 @@ void perplexity(llama_context * ctx, const gpt_params & params) { const auto token_org = tokens[batch_start]; // add BOS token for the first batch of each chunk - if (j == 0) { - tokens[batch_start] = llama_token_bos(); + if (add_bos && j == 0) { + tokens[batch_start] = llama_token_bos(ctx); } if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); - return; + return {tokens, -1, logit_history, prob_history}; } // restore the original token in case it was set to BOS @@ -88,7 +354,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) { fprintf(stderr, "%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%d minutes\n", total_seconds / 60); + fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); } // We get the logits for all the tokens in the context window 
(params.n_ctx) @@ -103,25 +369,61 @@ void perplexity(llama_context * ctx, const gpt_params & params) { // Example, we have a context window of 512, we will compute perplexity for each of the // last 256 tokens. Then, we split the input up into context window size chunks to // process the entire prompt. - for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) { - // Calculate probability of next token, given the previous ones. - const std::vector tok_logits( - logits.begin() + (j + 0) * n_vocab, - logits.begin() + (j + 1) * n_vocab); + const int first = params.n_ctx/2; + process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, + workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); + count += params.n_ctx - first - 1; - const float prob = softmax(tok_logits)[tokens[start + j + 1]]; - - nll += -std::log(prob); - ++count; - } // perplexity is e^(average negative log-likelihood) - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + if (params.ppl_output_type == 0) { + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + } else { + double av = nll/count; + double av2 = nll2/count - av*av; + if (av2 > 0) av2 = sqrt(av2/(count-1)); + printf("%8d %.4lf %4lf %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2); + } fflush(stdout); } printf("\n"); + + nll2 /= count; + nll /= count; + const double ppl = exp(nll); + nll2 -= nll * nll; + if (nll2 > 0) { + nll2 = sqrt(nll2/(count-1)); + printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + } else { + printf("Unexpected negative standard deviation of log(prob)\n"); + } + + return {tokens, ppl, logit_history, prob_history}; } -void hellaswag_score(llama_context * ctx, const gpt_params & params) { +static std::vector hellaswag_evaluate_tokens( + llama_context * ctx, const std::vector& tokens, int n_past, int n_batch, int n_vocab, int n_thread +) { + std::vector result; + result.reserve(tokens.size() * n_vocab); + size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch; + for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) { + size_t n_tokens = tokens.size() - i_chunk * n_batch; + n_tokens = std::min(n_tokens, size_t(n_batch)); + if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return {}; + } + + const auto logits = llama_get_logits(ctx); + result.insert(result.end(), logits, logits + n_tokens * n_vocab); + + n_past += n_tokens; + } + return result; +} + +static void hellaswag_score(llama_context * ctx, const gpt_params & params) { // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -155,8 +457,11 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { size_t hs_task_count = prompt_lines.size()/6; fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); + const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + fprintf(stderr, "================================= is_spm = %d\n", is_spm); + // This is needed as usual for LLaMA models - bool prepend_bos = true; + const bool add_bos = is_spm; // Number of tasks to use when computing the score if ( params.hellaswag_tasks < hs_task_count ) { @@ -194,7 +499,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { hs_data[i].context = prompt_lines[idx*6]; 
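Each HellaSwag task occupies six consecutive lines of the prompt file (the context, the gold ending index, then the four candidate endings), which is why the loader here indexes prompt_lines at idx*6. A minimal sketch of that record layout; the struct and field names simply mirror the hs_data fields used in this function:

```cpp
#include <cstddef>
#include <string>
#include <vector>

// Illustrative mirror of the hs_data record; one task per six prompt lines.
struct hs_task {
    std::string context;
    size_t      gold_ending_idx;
    std::string ending[4];
};

// Unpack task `idx` from the flat list of prompt lines.
static hs_task parse_hs_task(const std::vector<std::string> & prompt_lines, size_t idx) {
    hs_task task;
    task.context         = prompt_lines[idx*6];
    task.gold_ending_idx = std::stoi(prompt_lines[idx*6 + 1]);
    for (size_t j = 0; j < 4; j++) {
        task.ending[j] = prompt_lines[idx*6 + 2 + j];
    }
    return task;
}

int main() {
    const std::vector<std::string> lines = {
        "A man is sitting on a roof.", "3",
        "ending 0", "ending 1", "ending 2", "ending 3",
    };
    const hs_task t = parse_hs_task(lines, 0);
    return t.gold_ending_idx == 3 ? 0 : 1;
}
```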
hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] ); for (size_t j=0; j < 4; j++) { - hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j]; + hs_data[i].ending[j] = prompt_lines[idx*6+2+j]; } // Delete the selected random example from the prompt @@ -209,50 +514,105 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { double acc = 0.0f; const int n_vocab = llama_n_vocab(ctx); - for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) { + std::vector<std::vector<int>> ending_tokens(4); + std::vector<float> tok_logits(n_vocab); + + for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) { // Tokenize the context to count tokens - std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos); + std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos); size_t context_size = context_embd.size(); - for (size_t ending_idx=0;ending_idx<4;ending_idx++) { + for (int i = 0; i < 4; ++i) { + ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos); + for (int k = 0; k < int(context_size); ++k) { + if (ending_tokens[i][k] != context_embd[k]) { + fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k); + break; + } + } + } + + // Do the 1st ending + // In this case we include the context when evaluating + //auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos); + auto query_embd = ending_tokens[0]; + auto query_size = query_embd.size(); + + // Stop if query won't fit the ctx window + if (query_size > (size_t)params.n_ctx) { + fprintf(stderr, "%s : number of tokens in query %zu > n_ctx\n", __func__, query_size); + return; + } + + // Speedup small evaluations by evaluating at least 32 tokens + if (query_size < 32) { + query_embd.resize(32); + } + + auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads); + if (logits.empty()) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return; + } + + std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float)); + const auto first_probs = softmax(tok_logits); + + hs_data[task_idx].ending_logprob_count[0] = 1; + hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]); + + // Calculate the logprobs over the ending + for (size_t j = context_size; j < query_size - 1; j++) { + + std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float)); + + const float prob = softmax(tok_logits)[query_embd[j + 1]]; + + hs_data[task_idx].ending_logprob[0] += std::log(prob); + hs_data[task_idx].ending_logprob_count[0]++; + } + + // Calculate the mean token logprob for acc_norm + hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0]; + + // Do the remaining endings + // For these, we use the bare ending with n_past = context_size + // + for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) { // Tokenize the query - std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos); - size_t query_size = query_embd.size(); + query_embd.resize(ending_tokens[ending_idx].size() - context_size); + std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int)); + query_size = query_embd.size(); // Stop if query won't fit the ctx window - if (query_size > (size_t)params.n_ctx) { + if (context_size + query_size > (size_t)params.n_ctx) { fprintf(stderr, "%s : number of tokens in query %zu > n_ctx\n", __func__, query_size); return; } // Speedup small evaluations by evaluating at least 32 tokens - if (query_size < 32) { - query_embd.resize(32); - } + // No, resizing to 32 is actually slightly slower (at least on CUDA) + //if (query_size < 32) { + // query_embd.resize(32); + //} // Evaluate the query - if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) { + logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads); + if (logits.empty()) { fprintf(stderr, "%s : failed to eval\n", __func__); return; } - const auto query_logits = llama_get_logits(ctx); - std::vector<float> logits; - logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab); - - hs_data[task_idx].ending_logprob_count[ending_idx] = 0; - hs_data[task_idx].ending_logprob[ending_idx] = 0.0f; + hs_data[task_idx].ending_logprob_count[ending_idx] = 1; + hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]); // Calculate the logprobs over the ending - for (size_t j = context_size-1; j < query_size - 1; j++) { - // Calculate probability of next token, given the previous ones. - const std::vector<float> tok_logits( - logits.begin() + (j + 0) * n_vocab, - logits.begin() + (j + 1) * n_vocab); + for (size_t j = 0; j < query_size - 1; j++) { + std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float)); - const float prob = softmax(tok_logits)[query_embd[ j + 1]]; + const float prob = softmax(tok_logits)[query_embd[j + 1]]; hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob); hs_data[task_idx].ending_logprob_count[ending_idx]++; @@ -267,9 +627,9 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { } // Find the ending with maximum logprob - size_t ending_logprob_max_idx = -1; - double ending_logprob_max_val = -INFINITY; - for (size_t j=0; j < 4; j++) { + size_t ending_logprob_max_idx = 0; + double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0]; + for (size_t j = 1; j < 4; j++) { if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) { ending_logprob_max_idx = j; ending_logprob_max_val = hs_data[task_idx].ending_logprob[j]; @@ -297,19 +657,20 @@ int main(int argc, char ** argv) { gpt_params params; params.n_batch = 512; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { return 1; } params.perplexity = true; params.n_batch = std::min(params.n_batch, params.n_ctx); - if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" - "expect poor results\n", __func__, params.n_ctx); + if (params.ppl_stride > 0) { + fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n", + params.n_ctx, params.n_ctx + params.ppl_stride/2); + params.n_ctx += params.ppl_stride/2; } - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + print_build_info(); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); @@ -334,6 +695,12 @@ int main(int argc, char ** argv) { return 1; } + const int n_ctx_train = llama_n_ctx_train(ctx); + if (params.n_ctx > n_ctx_train) { + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, params.n_ctx); + } + // print system information { fprintf(stderr, "\n"); @@ -341,13
+708,16 @@ int main(int argc, char ** argv) { params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } + struct results_perplexity results; if (params.hellaswag) { hellaswag_score(ctx, params); } else { - perplexity(ctx, params); + results = perplexity(ctx, params); } llama_print_timings(ctx); + write_logfile(ctx, params, model, results); + llama_free(ctx); llama_free_model(model); diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt index c5c394058ced81642e6a503f86a87b448f88d474..db182e2633f1fb7e1e9075f9d6427309d683004b 100644 --- a/examples/quantize-stats/CMakeLists.txt +++ b/examples/quantize-stats/CMakeLists.txt @@ -2,4 +2,5 @@ set(TARGET quantize-stats) add_executable(${TARGET} quantize-stats.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 6aa06ec8fa1152dc3474c071ac3657b031dbe08c..94edb94d958d06d14e18436423a977f04b86b49f 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,7 +1,7 @@ -#include "ggml.h" -#include "build-info.h" - #define LLAMA_API_INTERNAL +#include "build-info.h" +#include "common.h" +#include "ggml.h" #include "llama.h" #include @@ -24,7 +24,7 @@ #endif struct quantize_stats_params { - std::string model = "models/7B/ggml-model-f16.bin"; + std::string model = "models/7B/ggml-model-f16.gguf"; bool verbose = false; bool per_layer_stats = false; bool print_histogram = false; @@ -34,8 +34,8 @@ struct quantize_stats_params { std::vector include_types; }; -const size_t HISTOGRAM_BUCKETS = 150; -const double HISTOGRAM_RANGE = 0.03; +constexpr size_t HISTOGRAM_BUCKETS = 150; +constexpr double HISTOGRAM_RANGE = 0.03; struct error_stats { size_t num_samples; @@ -44,8 +44,7 @@ struct error_stats { uint64_t error_histogram[HISTOGRAM_BUCKETS]; }; - -void quantize_stats_print_usage(int /*argc*/, char ** argv) { +static void quantize_stats_print_usage(int /*argc*/, char ** argv) { quantize_stats_params params; fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); @@ -71,7 +70,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) { } // Check if a layer is included/excluded by command line -bool layer_included(const quantize_stats_params params, const std::string & layer) { +static bool layer_included(const quantize_stats_params & params, const std::string & layer) { for (const auto& excluded : params.exclude_layers) { if (std::regex_search(layer, std::regex(excluded))) { return false; @@ -86,7 +85,7 @@ bool layer_included(const quantize_stats_params params, const std::string & laye } // Update error statistics given vectors with the before/after result of quantization -void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) { +static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) { for (int64_t i = 0; i < nelements; i++) { double diff = input[i] - output[i]; stats.total_error += diff * diff; @@ -96,14 +95,14 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou stats.num_samples += nelements; } -void combine_error_stats(error_stats & into, const error_stats & from) { +static void combine_error_stats(error_stats 
& into, const error_stats & from) { into.num_samples += from.num_samples; into.total_error += from.total_error; if (from.max_error > into.max_error) into.max_error = from.max_error; for (size_t i=0; inb[3] == tensor->nb[2]*tensor->ne[2]; } -void test_roundtrip_on_chunk( - const ggml_tensor * layer, - int64_t offset, - int64_t chunk_size, - const ggml_type_traits_t & qfns, - bool use_reference, - float * input_scratch, - char * quantized_scratch, - float * output_scratch, - error_stats & stats) { - +static void test_roundtrip_on_chunk( + const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference, + float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats +) { if (layer->type == GGML_TYPE_F16) { for (int i = 0; i < chunk_size; i++) { input_scratch[i] = ggml_get_f32_1d(layer, i + offset); @@ -174,18 +166,11 @@ void test_roundtrip_on_chunk( // Run quantization function for a single layer and update error stats -void test_roundtrip_on_layer( - std::string & name, - bool print_layer_stats, - const ggml_type_traits_t & qfns, - bool use_reference, - const ggml_tensor * layer, - std::vector & input_scratch, - std::vector & quantized_scratch, - std::vector & output_scratch, - error_stats & total_error, - int max_thread = 0) { - +static void test_roundtrip_on_layer( + std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference, + const ggml_tensor * layer, std::vector & input_scratch, std::vector & quantized_scratch, + std::vector & output_scratch, error_stats & total_error, int max_thread = 0 +) { assert(tensor_is_contiguous(layer)); error_stats layer_error {}; uint64_t nelements = ggml_nelements(layer); @@ -314,7 +299,7 @@ int main(int argc, char ** argv) { return 1; } - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + print_build_info(); // load the model fprintf(stderr, "Loading model\n"); diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 47d0be72ecc0fa15c353af8af0fbf1d81fa65fa3..4a8eed544cb04a57111f9d9d86c5b6874f17b384 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -2,6 +2,7 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 0d5b906fb77b523d88c93c56a567e8015ae12827..1c1d957e63d5b33430ef327ba388d7478071ecfa 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -1,3 +1,5 @@ +#include "build-info.h" +#include "common.h" #include "llama.h" #include @@ -12,31 +14,33 @@ struct quant_option { }; static const std::vector QUANT_OPTIONS = { - { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.50G, +0.2499 ppl @ 7B", }, - { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1846 ppl @ 7B", }, - { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.30G, +0.0796 ppl @ 7B", }, - { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0415 ppl @ 7B", }, + { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", }, + { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", }, + { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, + { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, 
" 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, #ifdef GGML_USE_K_QUANTS - { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.67G, +0.8698 ppl @ 7B", }, + { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, - { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", }, - { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", }, - { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", }, + { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, + { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, + { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", }, { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, - { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", }, - { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", }, + { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", }, + { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", }, { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, - { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", }, - { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", }, - { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0044 ppl @ 7B", }, + { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", }, + { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, + { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", }, #endif - { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ 7B", }, + { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, + // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. + { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; -bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { +static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { std::string ftype_str; for (auto ch : ftype_str_in) { @@ -66,15 +70,20 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std: } // usage: -// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads] +// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // -void usage(const char * executable) { - fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable); - fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); - fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); - fprintf(stderr, "\nAllowed quantization types:\n"); +static void usage(const char * executable) { + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); + printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); + printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { - printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str()); + if (it.name != "COPY") { + printf(" %2d or ", it.ftype); + } else { + printf(" "); + } + printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str()); } exit(1); } @@ -98,7 +107,7 @@ int main(int argc, char ** argv) { } } - if (argc - arg_idx < 3) { + if (argc - arg_idx < 2) { usage(argv[0]); } @@ -112,13 +121,16 @@ int main(int argc, char ** argv) { std::string ftype_str; if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { std::string fpath; - const size_t pos = fname_inp.find_last_of('/'); + const size_t pos = fname_inp.find_last_of("/\\"); if (pos != std::string::npos) { fpath = fname_inp.substr(0, pos + 1); } - // export as [inp path]/ggml-model-[ftype].bin - fname_out = fpath + "ggml-model-" + ftype_str + ".bin"; + // export as [inp path]/ggml-model-[ftype].gguf + fname_out = fpath + "ggml-model-" + ftype_str + ".gguf"; arg_idx++; + if (ftype_str == "COPY") { + params.only_copy = true; + } } else { fname_out = argv[arg_idx]; @@ -132,6 +144,9 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]); return 1; } + if (ftype_str == "COPY") { + params.only_copy = true; + } arg_idx++; } @@ -146,6 +161,8 @@ int main(int argc, char ** argv) { } } + print_build_info(); + fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str()); if (params.nthread > 0) { fprintf(stderr, " using %d threads", params.nthread); diff --git a/examples/reason-act.sh b/examples/reason-act.sh index e7fe655dbcea3a724f575abc16a8ef6de8197b5f..046c48db584bc31f9c5fd57f84c21d84932a3511 100644 --- a/examples/reason-act.sh +++ b/examples/reason-act.sh @@ -1,4 +1,3 @@ - #!/bin/bash cd `dirname $0` diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 61c71c3589fdf28869adbad4fa22e05c56fb110e..95527bb863524f80e58fccff86c967a70b89995b 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,6 +1,6 @@ +#include "build-info.h" #include "common.h" #include "llama.h" -#include "build-info.h" #include #include @@ -13,11 +13,11 @@ int main(int argc, char ** argv) { params.repeat_last_n = 64; params.prompt = "The quick brown fox"; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { return 1; } - fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + print_build_info(); if (params.n_predict < 0) { params.n_predict = 16; @@ -26,7 +26,6 @@ int main(int argc, char ** argv) { auto lparams = llama_context_default_params(); lparams.n_ctx = params.n_ctx; - lparams.n_gqa = params.n_gqa; lparams.seed = params.seed; lparams.f16_kv = 
params.memory_f16; lparams.use_mmap = params.use_mmap; @@ -45,9 +44,8 @@ int main(int argc, char ** argv) { llama_free_model(model); return 1; } - auto tokens = std::vector(params.n_ctx); - auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true); - + auto tokens = llama_tokenize(ctx, params.prompt, true); + auto n_prompt_tokens = tokens.size(); if (n_prompt_tokens < 1) { fprintf(stderr, "%s : failed to tokenize prompt\n", __func__); llama_free(ctx); @@ -89,10 +87,10 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx, &candidates_p); - auto next_token_str = llama_token_to_str(ctx, next_token); + auto next_token_str = llama_token_to_piece(ctx, next_token); last_n_tokens_data.push_back(next_token); - printf("%s", next_token_str); + printf("%s", next_token_str.c_str()); if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_free(ctx); @@ -149,10 +147,10 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx2, &candidates_p); - auto next_token_str = llama_token_to_str(ctx2, next_token); + auto next_token_str = llama_token_to_piece(ctx2, next_token); last_n_tokens_data.push_back(next_token); - printf("%s", next_token_str); + printf("%s", next_token_str.c_str()); if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_free(ctx2); diff --git a/examples/server/README.md b/examples/server/README.md index e56ca063a9f0e66e3d44e2fd91bfa63b66e525ae..5176080463839618087ff6dd90bfd6ef7d6ba907 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to Command line options: - `--threads N`, `-t N`: Set the number of threads to use during computation. -- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). - `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. @@ -16,6 +16,7 @@ Command line options: - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. +- `--numa`: Attempt optimizations that help on some NUMA systems. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). 
This allows you to adapt the pretrained model to specific tasks or domains. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. @@ -47,15 +48,14 @@ To get started right away, run the following command, making sure to use the cor ### Unix-based systems (Linux, macOS, etc.): ```bash -./server -m models/7B/ggml-model.bin -c 2048 +./server -m models/7B/ggml-model.gguf -c 2048 ``` ### Windows: ```powershell -server.exe -m models\7B\ggml-model.bin -c 2048 +server.exe -m models\7B\ggml-model.gguf -c 2048 ``` - The above command will start a server that by default listens on `127.0.0.1:8080`. You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url. @@ -77,34 +77,31 @@ You need to have [Node.js](https://nodejs.org/en) installed. ```bash mkdir llama-client cd llama-client -npm init -npm install axios ``` Create a index.js file and put inside this: ```javascript -const axios = require("axios"); - const prompt = `Building a website can be done in 10 simple steps:`; async function Test() { - let result = await axios.post("http://127.0.0.1:8080/completion", { - prompt, - n_predict: 512, - }); - - // the response is received until completion finish - console.log(result.data.content); + let response = await fetch("http://127.0.0.1:8080/completion", { + method: 'POST', + body: JSON.stringify({ + prompt, + n_predict: 512, + }) + }) + console.log((await response.json()).content) } -Test(); +Test() ``` And run it: ```bash -node . +node index.js ``` ## API Endpoints @@ -126,7 +123,7 @@ node . `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. - `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does. + `prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does. `stop`: Specify a JSON array of stopping strings. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). @@ -167,6 +164,12 @@ node . Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`. +- **POST** `/detokenize`: Convert tokens to text. + + *Options:* + + `tokens`: Set the tokens to detokenize. + - **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. 
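The new `/detokenize` route above complements `/tokenize`, giving a full round trip between text and token ids. A hedged sketch with curl, assuming a server on the default `127.0.0.1:8080`; the example token ids and the exact response fields are illustrative, not guaranteed:

```bash
# text -> token ids (illustrative response: {"tokens": [...]})
curl -s --request POST --url http://127.0.0.1:8080/tokenize \
    --data '{"content": "Building a website"}'

# token ids -> text (illustrative response: {"content": "..."})
curl -s --request POST --url http://127.0.0.1:8080/detokenize \
    --data '{"tokens": [8893, 263, 4700]}'
```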
diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py
index aa325a03ee444c9d141b1d35d4c44ce3979336ad..ed19237b0b3e5bb82aae4b45513ae7392f320fd3 100644
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import argparse
 from flask import Flask, jsonify, request, Response
 import urllib.parse
diff --git a/examples/server/chat.mjs b/examples/server/chat.mjs
index 8269e2592733b55a901ee461d9065c39a794cf21..87f4d2926ca8752614074f48068a2cada29a949b 100644
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@@ -1,5 +1,34 @@
 import * as readline from 'node:readline'
 import { stdin, stdout } from 'node:process'
+import { readFileSync } from 'node:fs'
+import { SchemaConverter } from './public/json-schema-to-grammar.mjs'
+
+const args = process.argv.slice(2);
+const grammarJsonSchemaFile = args.find(
+    (_, index) => args[index - 1] === "--grammar-json-schema"
+);
+const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
+
+// Example usage: function,arguments
+const grammarJsonSchemaPropOrder = args.find(
+    (_, index) => args[index - 1] === "--grammar-json-schema-prop-order"
+);
+const propOrder = grammarJsonSchemaPropOrder
+    ? grammarJsonSchemaPropOrder
+          .split(",")
+          .reduce((acc, cur, index) => ({ ...acc, [cur]: index }), {})
+    : {};
+
+let grammar = null
+if (grammarJsonSchemaFile) {
+    const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
+    const converter = new SchemaConverter(propOrder)
+    converter.visit(schema, '')
+    grammar = converter.formatGrammar()
+}
+if (grammarFile) {
+    grammar = readFileSync(grammarFile, 'utf-8')
+}
 
 const API_URL = 'http://127.0.0.1:8080'
 
@@ -48,6 +77,7 @@ async function chat_completion(question) {
             n_keep: n_keep,
             n_predict: 256,
             stop: ["\n### Human:"], // stop completion after generating this
+            grammar,
             stream: true,
         })
     })
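The new flags let `chat.mjs` constrain completions with a GBNF grammar, either passed directly (`--grammar grammar.gbnf`) or derived from a JSON schema (`node chat.mjs --grammar-json-schema schema.json --grammar-json-schema-prop-order function,arguments`). A minimal sketch of the conversion path added above, run from `examples/server/` with a hypothetical `schema.json` as input:

```javascript
// Convert a JSON schema to a GBNF grammar string, as chat.mjs now does;
// the result is sent as the `grammar` field of the /completion request body.
// `schema.json` is a hypothetical input file.
import { readFileSync } from 'node:fs'
import { SchemaConverter } from './public/json-schema-to-grammar.mjs'

const schema = JSON.parse(readFileSync('schema.json', 'utf-8'))
// prop order {name: rank}: emit "function" before "arguments" in generated objects
const converter = new SchemaConverter({ function: 0, arguments: 1 })
converter.visit(schema, '')
console.log(converter.formatGrammar())
```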
diff --git a/examples/server/deps.sh b/examples/server/deps.sh
index 1e9fe964b961a3249c279ab36a2e4d583da1c320..ea23e64500b09b7535725fe5bb9574a33c729192 100644
--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@@ -11,8 +11,10 @@ echo >> $PUBLIC/index.js # add newline
 
 FILES=$(ls $PUBLIC)
 
+cd $PUBLIC
 for FILE in $FILES; do
-    func=$(echo $FILE | tr '.' '_')
-    echo "generate $FILE.hpp ($func)"
-    xxd -n $func -i $PUBLIC/$FILE > $DIR/$FILE.hpp
+    echo "generate $FILE.hpp"
+
+    # use simple flag for old version of xxd
+    xxd -i $FILE > $DIR/$FILE.hpp
 done
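For context on the `deps.sh` change: plain `xxd -i FILE` emits a ready-to-include C array (for example `unsigned char index_html[] = { ... };` plus a matching `_len` variable) and derives the identifier from the input file name, which is why the script now does `cd $PUBLIC` and passes a bare `$FILE` instead of naming the symbol explicitly with the newer `-n` flag. The regenerated headers below are exactly this output.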
diff --git a/examples/server/index.html.hpp b/examples/server/index.html.hpp
index bd0b4787068e096e46db5322f431a3e037abf23a..f302329299f43445ef3dc1ecab64ed02a8e44489 100644
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
@@ -152,519 +152,1178 @@ unsigned char index_html[] = {
[hunk body elided: auto-generated `xxd -i` hex dump of the regenerated examples/server/index.html; rebuild with examples/server/deps.sh]
0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, - 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x73, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, - 0x3e, 0x22, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x28, 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x7d, 0x3a, 0x22, - 0x29, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, - 0x22, 0x7b, 0x7b, 0x75, 0x73, 0x65, 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x29, - 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, - 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, - 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, - 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x6c, 0x6c, - 0x61, 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, - 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3a, - 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x29, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x63, 0x68, 0x75, - 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, - 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x64, - 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, - 0x20, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x20, 0x6c, 0x65, 0x61, 0x64, - 0x69, 0x6e, 0x67, 0x20, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, - 0x63, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x20, 0x3d, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, - 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, - 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x5c, 0x73, 0x2b, 0x2f, 0x2c, 0x20, 0x22, - 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x2e, 0x6c, 0x65, + 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3e, 0x20, 0x30, 0x29, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x72, 0x61, + 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, + 0x65, 0x20, 0x3d, 0x20, 0x28, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, + 0x69, 0x70, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, + 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, + 0x74, 0x65, 0x6d, 0x70, 0x6c, 
0x61, 0x74, 0x65, 0x20, 0x72, 0x65, 0x70, + 0x6c, 0x61, 0x63, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, + 0x3d, 0x20, 0x28, 0x73, 0x74, 0x72, 0x2c, 0x20, 0x65, 0x78, 0x74, 0x72, + 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x3d, + 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, + 0x74, 0x20, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, + 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, + 0x20, 0x28, 0x65, 0x78, 0x74, 0x72, 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, + 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, + 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x74, 0x74, 0x69, + 0x6e, 0x67, 0x73, 0x2c, 0x20, 0x2e, 0x2e, 0x2e, 0x65, 0x78, 0x74, 0x72, + 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x53, 0x74, + 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, 0x74, 0x72, 0x29, 0x2e, 0x72, 0x65, + 0x70, 0x6c, 0x61, 0x63, 0x65, 0x41, 0x6c, 0x6c, 0x28, 0x2f, 0x5c, 0x7b, + 0x5c, 0x7b, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x7d, 0x5c, 0x7d, 0x2f, + 0x67, 0x2c, 0x20, 0x28, 0x5f, 0x2c, 0x20, 0x6b, 0x65, 0x79, 0x29, 0x20, + 0x3d, 0x3e, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, + 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x5b, 0x6b, 0x65, 0x79, + 0x5d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x65, 0x6e, 0x64, 0x20, + 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x73, + 0x65, 0x72, 0x76, 0x65, 0x72, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x74, 0x20, 0x3d, 0x20, 0x61, + 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x6d, 0x73, 0x67, 0x29, 0x20, 0x3d, + 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, + 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, + 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x61, 0x6c, 0x72, 0x65, 0x61, + 0x64, 0x79, 0x20, 0x72, 0x75, 0x6e, 0x6e, 0x69, 0x6e, 0x67, 0x2e, 0x2e, + 0x2e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, + 0x6f, 0x72, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, + 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, - 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, 0x2e, 0x2e, 0x2e, 0x68, 0x69, 0x73, - 0x74, 0x6f, 0x72, 0x79, 0x2c, 0x20, 0x5b, 0x22, 0x7b, 0x7b, 0x63, 0x68, - 0x61, 0x72, 0x7d, 0x7d, 0x22, 0x2c, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, - 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x5d, 0x5d, 0x29, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x64, 0x61, 0x74, 
0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, - 0x28, 0x22, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x66, 0x69, 0x6e, 0x69, 0x73, 0x68, 0x65, 0x64, 0x3a, 0x20, 0x27, - 0x22, 0x2c, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, - 0x73, 0x73, 0x61, 0x67, 0x65, 0x2c, 0x20, 0x22, 0x27, 0x2c, 0x20, 0x73, - 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x64, - 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, - 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, - 0x3d, 0x20, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, - 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, - 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, - 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, 0x6e, 0x70, - 0x75, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, - 0x67, 0x65, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, - 0x61, 0x6c, 0x28, 0x22, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x74, 0x6f, 0x70, - 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x2e, 0x70, 0x72, - 0x65, 0x76, 0x65, 0x6e, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, - 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, - 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b, 0x0a, + 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, + 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x2c, 0x20, 0x5b, + 0x22, 0x7b, 0x7b, 0x75, 0x73, 0x65, 0x72, 0x7d, 0x7d, 0x22, 0x2c, 0x20, + 0x6d, 0x73, 0x67, 0x5d, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x6d, + 0x70, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, + 0x65, 0x28, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x2c, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x3a, 0x20, 0x6d, 0x73, 0x67, + 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x69, + 0x73, 0x74, 0x6f, 0x72, 0x79, 
0x3a, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, + 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, + 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x2e, 0x66, 0x6c, 0x61, 0x74, + 0x4d, 0x61, 0x70, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x28, 0x5b, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x64, + 0x61, 0x74, 0x61, 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x68, + 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, + 0x74, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, + 0x73, 0x73, 0x61, 0x67, 0x65, 0x3a, 0x20, 0x41, 0x72, 0x72, 0x61, 0x79, + 0x2e, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x28, 0x64, 0x61, 0x74, + 0x61, 0x29, 0x20, 0x3f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, + 0x61, 0x74, 0x61, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d, 0x73, 0x67, 0x20, + 0x3d, 0x3e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x6e, 0x74, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x27, 0x29, + 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x5c, + 0x73, 0x2f, 0x2c, 0x20, 0x27, 0x27, 0x29, 0x20, 0x3a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x64, 0x61, 0x74, 0x61, 0x2c, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x22, 0x5c, 0x6e, 0x22, 0x29, + 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, + 0x61, 0x67, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x5b, 0x5d, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x68, + 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, + 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, + 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, + 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x73, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, + 0x22, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, + 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x29, + 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x22, + 0x7b, 0x7b, 0x75, 0x73, 0x65, 
0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x29, 0x5d, + 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, + 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, + 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, + 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x6c, 0x6c, 0x61, + 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, + 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3a, 0x20, + 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, + 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x64, 0x61, 0x74, + 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, + 0x65, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, + 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, + 0x74, 0x68, 0x20, 0x3e, 0x20, 0x30, 0x20, 0x26, 0x26, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, + 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, + 0x73, 0x5b, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, + 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, + 0x20, 0x2d, 0x20, 0x31, 0x5d, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, + 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x28, 0x2f, 0x5c, 0x6e, 0x24, + 0x2f, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, + 0x61, 0x67, 0x65, 0x73, 0x2e, 0x70, 0x6f, 0x70, 0x28, 0x29, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, + 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, + 0x74, 0x65, 0x28, 0x5b, 0x2e, 0x2e, 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, + 0x72, 0x79, 0x2c, 0x20, 0x5b, 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, + 0x7d, 0x7d, 0x22, 0x2c, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, + 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x5d, 0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x65, 0x74, 0x20, - 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x28, - 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, - 0x64, 0x61, 0x74, 0x65, 0x28, 
0x5b, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x22, 0x43, + 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x66, 0x69, + 0x6e, 0x69, 0x73, 0x68, 0x65, 0x64, 0x3a, 0x20, 0x27, 0x22, 0x2c, 0x20, + 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, + 0x67, 0x65, 0x73, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d, 0x73, 0x67, 0x20, + 0x3d, 0x3e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x6e, 0x74, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x27, 0x29, + 0x2c, 0x20, 0x22, 0x27, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, + 0x79, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, + 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, + 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x70, 0x75, 0x73, 0x68, + 0x28, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, + 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, + 0x2e, 0x2e, 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x2c, 0x20, + 0x5b, 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x7d, 0x22, 0x2c, + 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, + 0x61, 0x67, 0x65, 0x73, 0x5d, 0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x75, 0x62, 0x6d, 0x69, + 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x64, 0x61, 0x74, 0x61, 0x2e, + 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, 0x61, + 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x20, 0x3d, 0x20, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, + 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, + 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, + 0x6e, 0x70, 0x75, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x65, 0x73, + 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, + 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x22, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x74, + 0x6f, 0x70, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x2e, + 0x70, 0x72, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, + 0x6c, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, + 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2e, 
0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x28, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x68, 0x61, 0x74, 0x28, 0x6d, 0x65, 0x73, 0x73, 0x61, - 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, - 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x22, - 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x65, 0x6e, 0x74, 0x65, 0x72, 0x53, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x73, - 0x20, 0x3d, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x68, - 0x69, 0x63, 0x68, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x31, 0x33, 0x20, 0x26, - 0x26, 0x20, 0x21, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x68, 0x69, - 0x66, 0x74, 0x4b, 0x65, 0x79, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x62, 0x6d, 0x69, - 0x74, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x72, - 0x6d, 0x20, 0x6f, 0x6e, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x3d, 0x24, - 0x7b, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x7d, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, - 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, - 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x32, 0x20, 0x6f, 0x6e, 0x6b, 0x65, 0x79, - 0x70, 0x72, 0x65, 0x73, 0x73, 0x3d, 0x24, 0x7b, 0x65, 0x6e, 0x74, 0x65, - 0x72, 0x53, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x73, 0x7d, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, - 0x67, 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, - 0x3d, 0x24, 0x7b, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x6d, 0x65, - 0x73, 0x73, 0x61, 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, - 0x3d, 0x20, 0x65, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x68, - 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x53, 0x61, 0x79, 0x20, 0x73, - 0x6f, 0x6d, 0x65, 0x74, 0x68, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x22, - 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x3c, 0x64, 0x69, 0x76, 0x20, 0x63, - 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x72, 0x69, 0x67, 0x68, 0x74, 0x22, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x74, 0x79, - 0x70, 0x65, 0x3d, 0x22, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x22, 0x20, - 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x21, + 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, + 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, 0x5d, 0x29, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x75, 0x62, + 0x6d, 0x69, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, + 0x74, 0x6f, 0x70, 0x28, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x68, 0x61, 0x74, 0x28, 0x6d, 0x65, 0x73, + 0x73, 0x61, 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, + 0x73, 0x61, 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, + 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x53, 0x75, 0x62, 0x6d, 0x69, + 0x74, 0x73, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, + 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, + 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x31, 0x33, + 0x20, 0x26, 0x26, 0x20, 0x21, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x73, + 0x68, 0x69, 0x66, 0x74, 0x4b, 0x65, 0x79, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x62, + 0x6d, 0x69, 0x74, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, + 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, + 0x6f, 0x72, 0x6d, 0x20, 0x6f, 0x6e, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, + 0x3d, 0x24, 0x7b, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x7d, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, + 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, + 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x4e, 0x61, + 0x6d, 0x65, 0x3d, 0x24, 0x7b, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, + 0x69, 0x6e, 0x67, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20, + 0x22, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x22, 0x20, 0x3a, 0x20, + 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6e, 0x69, + 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x28, 0x65, 0x29, 0x20, 0x3d, + 0x3e, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x65, 0x2e, 0x74, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x6f, 0x6e, 0x6b, 0x65, 0x79, 0x70, 0x72, 0x65, 0x73, 0x73, 0x3d, + 0x24, 0x7b, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x53, 0x75, 0x62, 0x6d, 0x69, + 0x74, 0x73, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, + 0x68, 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x53, 0x61, 0x79, 0x20, + 0x73, 0x6f, 0x6d, 0x65, 0x74, 0x68, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, + 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x32, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, + 0x74, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, + 0x22, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x22, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, + 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x72, 0x69, 0x67, 0x68, 0x74, + 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x74, + 0x79, 0x70, 0x65, 0x3d, 0x22, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x22, + 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x20, 0x3e, 0x53, 0x65, 0x6e, 0x64, 0x3c, - 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, - 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, - 0x3d, 0x24, 0x7b, 0x73, 0x74, 0x6f, 0x70, 0x7d, 0x20, 0x64, 0x69, 0x73, - 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x67, 0x65, 0x6e, 0x65, - 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x7d, 0x3e, 0x53, 0x74, 0x6f, 0x70, - 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, - 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, - 0x6b, 0x3d, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x65, 0x74, 0x7d, 0x3e, 0x52, - 0x65, 0x73, 0x65, 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x43, 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3d, 0x20, 0x28, 0x70, - 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, - 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x65, - 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 
0x6e, 0x65, 0x72, 0x20, 0x3d, 0x20, - 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x45, - 0x66, 0x66, 0x65, 0x63, 0x74, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, - 0x20, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x20, 0x74, 0x6f, 0x20, 0x62, - 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x20, 0x28, 0x69, 0x66, 0x20, 0x6e, 0x65, - 0x65, 0x64, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x65, - 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, - 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x70, - 0x61, 0x72, 0x65, 0x6e, 0x74, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x26, 0x26, 0x20, - 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, - 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x20, 0x3c, 0x3d, 0x20, 0x70, - 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, - 0x54, 0x6f, 0x70, 0x20, 0x2b, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, - 0x2e, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x48, 0x65, 0x69, 0x67, 0x68, - 0x74, 0x20, 0x2b, 0x20, 0x33, 0x30, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, - 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x54, 0x6f, - 0x28, 0x30, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, - 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x29, + 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3e, 0x53, 0x65, 0x6e, 0x64, 0x3c, 0x2f, + 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, + 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, + 0x24, 0x7b, 0x73, 0x74, 0x6f, 0x70, 0x7d, 0x20, 0x64, 0x69, 0x73, 0x61, + 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x21, 0x67, 0x65, 0x6e, 0x65, + 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x7d, 0x3e, 0x53, 0x74, 0x6f, 0x70, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, + 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, + 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x72, 0x65, + 0x73, 0x65, 0x74, 0x7d, 0x3e, 0x52, 0x65, 0x73, 0x65, 0x74, 0x3c, 0x2f, + 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, + 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x68, 0x61, 0x74, 0x4c, 0x6f, + 0x67, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, + 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, + 0x73, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, + 0x72, 0x69, 0x70, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 
0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, + 0x6e, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, + 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x28, + 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x63, 0x72, 0x6f, 0x6c, + 0x6c, 0x20, 0x74, 0x6f, 0x20, 0x62, 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x20, + 0x28, 0x69, 0x66, 0x20, 0x6e, 0x65, 0x65, 0x64, 0x65, 0x64, 0x29, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x63, + 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, + 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x45, + 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x70, 0x61, 0x72, 0x65, + 0x6e, 0x74, 0x20, 0x26, 0x26, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, + 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, + 0x74, 0x20, 0x3c, 0x3d, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, + 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x54, 0x6f, 0x70, 0x20, 0x2b, 0x20, + 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x6f, 0x66, 0x66, 0x73, 0x65, + 0x74, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x20, 0x2b, 0x20, 0x33, 0x30, + 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, + 0x72, 0x6f, 0x6c, 0x6c, 0x54, 0x6f, 0x28, 0x30, 0x2c, 0x20, 0x70, 0x61, + 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x48, + 0x65, 0x69, 0x67, 0x68, 0x74, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x2c, 0x20, 0x5b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x5d, + 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x74, 0x4c, 0x69, 0x6e, 0x65, 0x20, + 0x3d, 0x20, 0x28, 0x5b, 0x75, 0x73, 0x65, 0x72, 0x2c, 0x20, 0x64, 0x61, + 0x74, 0x61, 0x5d, 0x2c, 0x20, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x29, 0x20, + 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x4d, 0x65, + 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x41, 0x72, 0x72, 0x61, + 0x79, 0x2e, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x28, 0x64, 0x61, + 0x74, 0x61, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x66, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x73, + 0x20, 0x3e, 0x20, 0x30, 0x20, 0x26, 0x26, 0x20, 0x69, 0x73, 0x41, 0x72, + 0x72, 0x61, 0x79, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x68, 0x74, + 0x6d, 0x6c, 0x60, 0x3c, 0x24, 0x7b, 0x50, 0x72, 0x6f, 0x62, 0x61, 0x62, + 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x7d, 0x20, 0x64, 0x61, 0x74, + 0x61, 0x3d, 0x24, 0x7b, 0x64, 0x61, 0x74, 0x61, 0x7d, 0x20, 0x2f, 0x3e, + 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, + 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, + 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, + 0x79, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3f, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, + 0x61, 0x74, 0x61, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d, 0x73, 0x67, 0x20, + 0x3d, 0x3e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x6e, 0x74, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x27, 0x29, + 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x5c, + 0x73, 0x2b, 0x2f, 0x2c, 0x20, 0x27, 0x27, 0x29, 0x20, 0x3a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, + 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, + 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x24, 0x7b, 0x4d, 0x61, 0x72, + 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, 0x7d, 0x20, 0x74, 0x65, + 0x78, 0x74, 0x3d, 0x24, 0x7b, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, + 0x65, 0x28, 0x74, 0x65, 0x78, 0x74, 0x29, 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x6d, 0x65, 0x73, - 0x73, 0x61, 0x67, 0x65, 0x73, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, - 0x74, 0x4c, 0x69, 0x6e, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x5b, 0x75, 0x73, - 0x65, 0x72, 0x2c, 0x20, 0x6d, 0x73, 0x67, 0x5d, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x70, 0x20, 0x6b, 0x65, + 0x79, 0x3d, 0x24, 0x7b, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x7d, 0x3e, 0x3c, + 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x7b, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x75, 0x73, 0x65, 0x72, 0x29, 0x7d, + 0x3a, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x20, 0x24, + 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x3c, 0x2f, 0x70, + 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, 0x20, 0x72, + 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, + 0x73, 0x2e, 0x66, 0x6c, 0x61, 0x74, 0x4d, 0x61, 0x70, 0x28, 0x63, 0x68, + 0x61, 0x74, 0x4c, 0x69, 0x6e, 0x65, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x65, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x20, 0x3d, + 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, + 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, + 0x3e, 0x20, 0x73, 0x65, 0x73, 0x73, 
0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x73, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, + 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x65, 0x6c, 0x2e, 0x74, + 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, + 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, + 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, + 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, + 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, + 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, + 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, + 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x73, 0x65, + 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x49, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, + 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, + 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, + 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x4d, 0x61, 0x74, 0x68, 0x2e, + 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, + 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, + 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x29, 0x20, 0x7d, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4a, 0x73, 0x6f, + 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x70, 0x4f, + 0x72, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, + 0x6c, 0x28, 0x27, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, + 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4a, 0x73, 0x6f, 0x6e, 0x53, + 0x63, 0x68, 0x65, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, + 0x65, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4a, 0x73, 0x6f, 0x6e, + 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x70, 0x4f, 0x72, + 0x64, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, + 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 
0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x4a, + 0x53, 0x4f, 0x4e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x47, 0x72, 0x61, + 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, + 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x63, 0x68, + 0x65, 0x6d, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, + 0x61, 0x72, 0x73, 0x65, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, + 0x72, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, + 0x72, 0x74, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x53, + 0x63, 0x68, 0x65, 0x6d, 0x61, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, + 0x65, 0x72, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4a, + 0x73, 0x6f, 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x50, 0x72, 0x6f, + 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x2c, + 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x64, 0x75, 0x63, 0x65, + 0x28, 0x28, 0x61, 0x63, 0x63, 0x2c, 0x20, 0x63, 0x75, 0x72, 0x2c, 0x20, + 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x7b, 0x2e, 0x2e, 0x2e, 0x61, + 0x63, 0x63, 0x2c, 0x20, 0x5b, 0x63, 0x75, 0x72, 0x2e, 0x74, 0x72, 0x69, + 0x6d, 0x28, 0x29, 0x5d, 0x3a, 0x20, 0x69, 0x7d, 0x29, 0x2c, 0x20, 0x7b, + 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x2e, 0x76, + 0x69, 0x73, 0x69, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, + 0x20, 0x27, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x3a, 0x20, 0x63, 0x6f, + 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x2e, 0x66, 0x6f, 0x72, 0x6d, + 0x61, 0x74, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x28, 0x29, 0x2c, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, + 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, 0x65, + 0x72, 0x74, 0x28, 0x60, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x20, + 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x3a, 0x20, 0x24, 0x7b, 0x65, 0x2e, + 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x60, 0x29, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, + 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x28, 0x7b, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x2c, 0x20, 0x6d, 0x69, 0x6e, + 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, + 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, - 0x70, 0x20, 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x6d, 0x73, 0x67, 0x7d, - 0x3e, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x7b, 0x74, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x75, 0x73, 0x65, 0x72, - 0x29, 0x7d, 0x3a, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, - 0x20, 0x3c, 0x24, 0x7b, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, - 0x69, 0x73, 0x68, 0x7d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x24, 0x7b, - 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x6d, 0x73, 0x67, - 0x29, 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x60, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, - 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x64, 0x3d, - 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, - 0x7b, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x7d, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, - 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x66, 0x6c, - 0x61, 0x74, 0x4d, 0x61, 0x70, 0x28, 0x63, 0x68, 0x61, 0x74, 0x4c, 0x69, - 0x6e, 0x65, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x2f, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x60, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x6f, 0x6e, 0x66, - 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, - 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, - 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, - 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65, - 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, - 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, - 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, - 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, - 0x65, 0x5d, 0x3a, 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, - 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, - 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, - 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, - 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, - 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, - 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, - 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, - 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, - 0x5d, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, - 0x74, 0x28, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, - 0x20, 0x3d, 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, - 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, - 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, - 0x5d, 0x3a, 0x20, 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, - 0x72, 0x28, 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, 0x74, - 0x28, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x29, 0x29, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x46, 0x6c, - 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x28, - 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x2c, - 0x20, 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, - 0x73, 0x74, 0x65, 0x70, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, - 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, - 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, - 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, - 0x6d, 0x65, 0x7d, 0x22, 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x7d, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, - 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, - 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, - 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, - 0x24, 0x7b, 0x6d, 0x69, 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, - 0x22, 0x24, 0x7b, 0x6d, 0x61, 0x78, 0x7d, 0x22, 0x20, 0x73, 0x74, 0x65, - 0x70, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x74, 0x65, 0x70, 0x7d, 0x22, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, - 0x7d, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, - 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, - 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x7d, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, + 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, + 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, + 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, 0x3c, 0x2f, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, 0x65, + 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, + 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x69, + 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x24, 0x7b, 0x6d, + 0x61, 0x78, 0x7d, 0x22, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3d, 0x22, 0x24, + 0x7b, 0x73, 0x74, 0x65, 0x70, 0x7d, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, + 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, + 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, + 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x49, + 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x28, 0x7b, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x2c, 0x20, + 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, + 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x3e, 0x24, 0x7b, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, + 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, + 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, 0x69, + 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, + 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x69, 0x6e, 0x7d, 0x22, + 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x61, 0x78, 0x7d, + 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, + 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, + 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, + 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 
0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, @@ -672,481 +1331,1008 @@ unsigned char index_html[] = { 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, - 0x3d, 0x20, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, - 0x61, 0x78, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x29, 0x20, 0x3d, + 0x73, 0x74, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, + 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, + 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x76, 0x65, 0x6e, + 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, 0x29, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, + 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, + 0x74, 0x54, 0x6f, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x41, 0x6e, + 0x64, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x42, + 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, - 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, - 0x22, 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, 0x3c, 0x2f, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, - 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x6e, 0x67, - 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, - 0x65, 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, 0x7b, 0x6d, - 0x69, 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, 0x24, 0x7b, - 0x6d, 0x61, 0x78, 0x7d, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, - 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, - 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, - 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, - 0x6e, 0x3e, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, - 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 
- 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, - 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, + 0x69, 0x66, 0x20, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, + 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x20, + 0x3d, 0x3d, 0x20, 0x27, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x27, + 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, + 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x64, + 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3e, 0x55, 0x73, 0x69, 0x6e, + 0x67, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x74, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, + 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, + 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, + 0x6b, 0x3d, 0x24, 0x7b, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, + 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x7d, 0x3e, 0x52, + 0x65, 0x73, 0x65, 0x74, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x74, 0x6f, 0x20, + 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, + 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, + 0x66, 0x65, 0x63, 0x74, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, + 0x61, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x20, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x6f, 0x6e, 0x20, 0x65, 0x76, 0x65, + 0x72, 0x79, 0x20, 0x63, 0x68, 0x61, 0x6e, 0x67, 0x65, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, + 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, + 0x76, 0x65, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x2c, 0x20, 0x5b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, + 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, + 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x55, 0x73, 0x65, 0x72, + 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, + 0x74, 0x42, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, + 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, + 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, + 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, + 0x72, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x3e, 0x50, + 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x70, 0x72, - 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, + 0x61, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, + 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, + 0x70, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, + 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x7d, 0x22, 0x20, + 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, + 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, + 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, + 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, + 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, + 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x75, 0x73, 0x65, + 0x72, 0x22, 0x3e, 0x55, 0x73, 0x65, 0x72, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x74, 0x79, 0x70, - 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, - 0x6f, 0x6d, 0x70, 0x74, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, - 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, - 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, - 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, - 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, - 0x3d, 
0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, + 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, + 0x75, 0x73, 0x65, 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, + 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x75, 0x73, 0x65, 0x72, 0x7d, 0x22, 0x20, + 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, + 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, + 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x62, 0x6f, 0x74, 0x22, + 0x3e, 0x42, 0x6f, 0x74, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, + 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, + 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x63, 0x68, 0x61, + 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, + 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2e, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, + 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, + 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, + 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, + 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, + 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x50, 0x72, + 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, + 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x69, 0x64, + 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, + 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, + 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x7d, + 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20, 0x6f, 0x6e, 0x69, + 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, + 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, - 0x6f, 0x72, 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, 0x3e, 0x55, 0x73, - 0x65, 0x72, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, - 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, - 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x75, 0x73, 0x65, 0x72, 0x22, - 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, - 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x75, 0x73, 0x65, 0x72, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, - 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, - 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, - 0x72, 0x3d, 0x22, 0x62, 0x6f, 0x74, 0x22, 0x3e, 0x42, 0x6f, 0x74, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, + 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x22, 0x3e, 0x43, 0x68, 0x61, 0x74, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, + 0x72, 0x79, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, + 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, + 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, + 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, + 0x6d, 0x65, 0x3d, 0x22, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, + 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, + 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x68, 0x69, 0x73, 0x74, + 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x7d, + 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x31, 0x20, 0x6f, 0x6e, 0x69, + 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, + 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, + 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x22, 0x3e, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x3c, 0x2f, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, + 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x67, 0x72, + 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, + 0x22, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x22, 0x20, 0x70, 0x6c, + 0x61, 0x63, 
0x65, 0x68, 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x55, + 0x73, 0x65, 0x20, 0x67, 0x62, 0x6e, 0x66, 0x20, 0x6f, 0x72, 0x20, 0x4a, + 0x53, 0x4f, 0x4e, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2b, 0x63, + 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, + 0x72, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20, 0x6f, + 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, + 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3d, 0x22, 0x63, 0x68, 0x61, 0x72, 0x22, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, - 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x63, 0x68, 0x61, - 0x72, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, - 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, - 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, + 0x6d, 0x65, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x70, 0x2d, 0x6f, 0x72, 0x64, + 0x65, 0x72, 0x22, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x68, 0x6f, 0x6c, + 0x64, 0x65, 0x72, 0x3d, 0x22, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x3a, 0x20, + 0x70, 0x72, 0x6f, 0x70, 0x31, 0x2c, 0x70, 0x72, 0x6f, 0x70, 0x32, 0x2c, + 0x70, 0x72, 0x6f, 0x70, 0x33, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, + 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x47, + 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4a, 0x73, 0x6f, 0x6e, 0x53, 0x63, + 0x68, 0x65, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, + 0x72, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, + 0x74, 0x6f, 0x6e, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x62, 0x75, + 0x74, 0x74, 0x6f, 0x6e, 0x22, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, + 0x6b, 0x3d, 0x24, 0x7b, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x4a, + 0x53, 0x4f, 0x4e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x47, 0x72, 0x61, + 0x6d, 0x6d, 0x61, 0x72, 0x7d, 0x3e, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, + 0x74, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, + 0x61, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x22, 0x3e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, - 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, - 0x61, 0x72, 0x65, 
0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, - 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, - 0x73, 0x3d, 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, - 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, - 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x43, 0x68, 0x61, - 0x74, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x20, 0x74, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, - 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x68, - 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, - 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, - 0x73, 0x3d, 0x31, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, - 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, + 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, + 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, + 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, - 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, - 0x20, 0x32, 0x30, 0x34, 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, - 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6e, - 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x22, 0x2c, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2e, 0x76, 0x61, 
0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x65, - 0x64, 0x69, 0x63, 0x74, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, - 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, - 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, - 0x20, 0x31, 0x2e, 0x35, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, - 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, - 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, - 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, - 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, - 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x7d, 0x29, 0x7d, 0x0a, + 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, + 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x72, + 0x65, 0x64, 0x69, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x22, 0x2c, 0x20, + 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x30, 0x34, 0x38, 0x2c, 0x20, 0x6d, + 0x69, 0x6e, 0x3a, 0x20, 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, + 0x3a, 0x20, 0x22, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, + 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, + 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, + 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, - 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x65, - 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, - 0x74, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x22, 0x2c, - 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x2e, 0x30, 0x2c, 0x20, 0x6d, - 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, - 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, + 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x65, + 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, + 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x35, 0x2c, 0x20, 0x6d, 0x69, + 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, + 0x3a, 0x20, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, + 0x72, 0x65, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, + 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, + 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, + 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, + 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, + 0x20, 0x22, 0x50, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x20, 0x72, + 0x65, 0x70, 0x65, 0x61, 0x74, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, + 0x63, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x2e, + 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, + 0x61, 0x74, 0x5f, 0x70, 
0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, + 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, + 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, + 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x7d, + 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, + 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x43, + 0x6f, 0x6e, 0x73, 0x69, 0x64, 0x65, 0x72, 0x20, 0x4e, 0x20, 0x74, 0x6f, + 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x65, 0x6e, + 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, + 0x20, 0x32, 0x30, 0x34, 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, + 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, + 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x22, + 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, + 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x7d, + 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, + 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, + 0x6f, 0x70, 0x2d, 0x4b, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, + 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x30, + 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x2d, 0x31, 0x2c, 0x20, 0x6e, + 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x22, + 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, + 0x70, 0x5f, 0x6b, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, + 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x50, 0x20, 0x73, + 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, + 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, + 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, + 0x22, 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, - 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, - 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x43, 0x6f, 0x6e, 0x73, 0x69, - 0x64, 0x65, 0x72, 0x20, 0x4e, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, - 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, - 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, 0x30, 0x34, - 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x6e, - 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, - 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x22, 0x2c, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 
0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, - 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x5f, 0x6e, 0x7d, 0x29, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, - 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x4b, - 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20, - 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x2c, 0x20, 0x6d, 0x69, - 0x6e, 0x3a, 0x20, 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, - 0x20, 0x22, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x22, 0x2c, 0x20, 0x76, 0x61, + 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x7d, 0x29, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, + 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x75, 0x6d, + 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x4d, 0x6f, 0x72, 0x65, 0x20, 0x6f, 0x70, + 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x3c, 0x2f, 0x73, 0x75, 0x6d, 0x6d, 0x61, + 0x72, 0x79, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, + 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, + 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, + 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, + 0x3a, 0x20, 0x22, 0x54, 0x46, 0x53, 0x2d, 0x5a, 0x22, 0x2c, 0x20, 0x6d, + 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, + 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, + 0x20, 0x22, 0x74, 0x66, 0x73, 0x5f, 0x7a, 0x22, 0x2c, 0x20, 0x73, 0x74, + 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x7d, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x66, 0x73, 0x5f, 0x7a, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, - 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, - 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x50, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, - 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, - 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, - 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x6f, 0x70, - 0x5f, 0x70, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, - 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, - 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x65, 0x74, 0x61, 0x69, - 0x6c, 0x73, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, - 0x3e, 0x4d, 0x6f, 0x72, 0x65, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, - 0x73, 0x3c, 0x2f, 0x73, 0x75, 0x6d, 
0x6d, 0x61, 0x72, 0x79, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, - 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, - 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, - 0x46, 0x53, 0x2d, 0x5a, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, - 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, - 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x66, - 0x73, 0x5f, 0x7a, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, + 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, + 0x3a, 0x20, 0x22, 0x54, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x20, 0x50, + 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, + 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, + 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, + 0x6c, 0x5f, 0x70, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x74, 0x66, 0x73, 0x5f, 0x7a, 0x7d, 0x29, 0x7d, 0x0a, 0x20, + 0x65, 0x2e, 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x7d, + 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, + 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, + 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x20, + 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, + 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, + 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, + 0x22, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, + 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, + 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, + 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, - 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, - 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x20, 0x50, 0x22, 0x2c, 0x20, 0x6d, - 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, - 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, - 0x20, 0x22, 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x22, + 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, + 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, + 0x46, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x20, 0x70, 0x65, + 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, + 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, + 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x66, + 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 
0x79, 0x5f, 0x70, 0x65, 0x6e, + 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, + 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x2e, 0x66, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, + 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x68, 0x72, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, + 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, + 0x74, 0x68, 0x72, 0x65, 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, + 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, + 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, + 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, + 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, + 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x30, 0x22, + 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, + 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, + 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, + 0x30, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, + 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x6e, 0x6f, 0x20, + 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x3c, 0x2f, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, + 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, + 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, + 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x31, + 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, + 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, + 0x20, 0x31, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, + 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, 0x69, + 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x76, 0x31, 0x3c, 0x2f, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, + 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, + 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, + 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, + 0x32, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, + 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 
0x74, 0x61, 0x74, 0x20, 0x3d, + 0x3d, 0x20, 0x32, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, + 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, + 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x76, 0x32, 0x3c, 0x2f, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, + 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, + 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, + 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, + 0x20, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, + 0x31, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, + 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6d, + 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, - 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x7d, 0x29, 0x7d, 0x0a, 0x20, + 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, + 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, 0x7d, 0x29, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, + 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, + 0x20, 0x22, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x65, + 0x74, 0x61, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, + 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, + 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, + 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, 0x61, 0x22, 0x2c, 0x20, 0x73, + 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, + 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, 0x61, 0x7d, 0x29, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, - 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, - 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x20, 0x70, 0x65, 0x6e, 0x61, - 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, - 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, - 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x70, 0x72, 0x65, - 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, - 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, - 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 
0x5f, 0x70, 0x65, 0x6e, - 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x46, 0x72, 0x65, 0x71, - 0x75, 0x65, 0x6e, 0x63, 0x79, 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, - 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, - 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x66, 0x72, 0x65, 0x71, 0x75, - 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, - 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, - 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x66, - 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, - 0x61, 0x6c, 0x74, 0x79, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x72, 0x20, - 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, - 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x68, 0x72, 0x65, - 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, + 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, + 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x53, 0x68, 0x6f, 0x77, + 0x20, 0x50, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, + 0x65, 0x73, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, + 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x6e, 0x61, + 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x73, + 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, + 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, + 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, + 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x65, + 0x74, 0x61, 0x69, 0x6c, 0x73, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x70, 0x72, 0x6f, 0x62, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x3d, 0x20, + 0x28, 0x70, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x20, 0x3d, + 0x20, 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, + 0x31, 0x39, 0x32, 0x20, 0x2a, 0x20, 0x28, 0x31, 0x20, 0x2d, 0x20, 0x70, + 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x67, 0x20, 0x3d, 0x20, 0x4d, 0x61, 0x74, 0x68, + 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, 0x31, 0x39, 0x32, 0x20, 0x2a, + 0x20, 0x70, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x60, 0x72, 
0x67, 0x62, 0x61, 0x28, + 0x24, 0x7b, 0x72, 0x7d, 0x2c, 0x24, 0x7b, 0x67, 0x7d, 0x2c, 0x30, 0x2c, + 0x30, 0x2e, 0x33, 0x29, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x50, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, + 0x73, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, + 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, + 0x73, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d, + 0x73, 0x67, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x7b, 0x20, + 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, + 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, + 0x20, 0x7d, 0x20, 0x3d, 0x20, 0x6d, 0x73, 0x67, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x21, 0x63, 0x6f, + 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, + 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x20, 0x7c, + 0x7c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, + 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, + 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3d, 0x3d, 0x3d, 0x20, + 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x20, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, + 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6d, 0x70, + 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, + 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, + 0x67, 0x74, 0x68, 0x20, 0x3e, 0x20, 0x31, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, + 0x4e, 0x6f, 0x74, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x62, 0x79, 0x74, 0x65, + 0x20, 0x70, 0x61, 0x69, 0x72, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6d, 0x70, + 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, + 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x5b, 0x30, 0x5d, 0x2e, + 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x74, 0x61, 0x72, + 0x74, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x62, 0x79, 0x74, 0x65, + 0x3a, 0x20, 0x5c, 0x5c, 0x27, 0x29, 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x6e, 0x74, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x70, 0x6c, 0x69, + 0x74, 0x44, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, + 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, + 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x2e, 0x6d, 0x61, 0x70, + 0x28, 0x70, 0x72, 0x6f, 0x62, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, - 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, - 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 
0x61, 0x6d, 0x65, 0x3d, - 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x30, 0x22, 0x20, 0x63, 0x68, 0x65, - 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, - 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x30, 0x7d, 0x20, 0x6f, - 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, - 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x6e, 0x6f, 0x20, 0x4d, 0x69, 0x72, 0x6f, - 0x73, 0x74, 0x61, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, - 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, - 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, - 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x31, 0x22, 0x20, 0x63, 0x68, - 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, - 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x31, 0x7d, 0x20, - 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, - 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, - 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, - 0x61, 0x74, 0x20, 0x76, 0x31, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, - 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x22, - 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x32, 0x22, 0x20, 0x63, - 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, - 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, 0x20, 0x32, 0x7d, - 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, - 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, - 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, - 0x74, 0x61, 0x74, 0x20, 0x76, 0x32, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, + 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3a, 0x20, 0x70, 0x72, 0x6f, + 0x62, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2c, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, + 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x3a, + 0x20, 0x5b, 0x70, 0x72, 0x6f, 0x62, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x29, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x24, 0x7b, 0x50, + 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, + 0x7d, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3d, 0x24, 
0x7b, 0x73, 0x70, 0x6c, + 0x69, 0x74, 0x44, 0x61, 0x74, 0x61, 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x7b, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x2c, 0x20, 0x63, 0x6f, + 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x7d, 0x20, 0x3d, 0x20, 0x63, 0x6f, + 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, + 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x5b, 0x30, + 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x20, 0x3d, 0x20, + 0x70, 0x72, 0x6f, 0x62, 0x73, 0x2e, 0x66, 0x69, 0x6e, 0x64, 0x28, 0x70, + 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x2e, 0x74, 0x6f, 0x6b, 0x5f, 0x73, 0x74, + 0x72, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, + 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x43, 0x6f, + 0x6c, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x20, + 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x28, + 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x29, 0x20, + 0x3a, 0x20, 0x27, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x61, 0x72, 0x65, + 0x6e, 0x74, 0x27, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, + 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, 0x3d, + 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x63, 0x6c, + 0x61, 0x73, 0x73, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x62, 0x2d, 0x73, 0x65, + 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x2e, + 0x6d, 0x61, 0x70, 0x28, 0x28, 0x70, 0x2c, 0x20, 0x69, 0x6e, 0x64, 0x65, + 0x78, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, - 0x6c, 0x64, 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, - 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x74, 0x61, 0x75, - 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x2e, 0x30, - 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, - 0x74, 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20, 0x73, 0x74, - 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, - 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, - 0x28, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, - 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x65, 0x74, 0x61, 0x22, 0x2c, - 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 
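// [decoded] The Probabilities component maps each message's completion_probabilities:
// it returns plain msg.content when there are none (or for byte-pair tokens whose
// content starts with 'byte: \\'), splits multi-token data into one entry per token,
// and for the single-token case picks the highlight color for the sampled token:
//
//   const { probs, content } = completion_probabilities[0]
//   const found = probs.find(p => p.tok_str === msg.content)
//   const pColor = found ? probColor(found.prob) : 'transparent'
//
// The interleaved `-` bytes are the old mirostat radio buttons and "Mirostat tau"
// FloatField that previously sat at this offset in the generated dump.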
0x2c, 0x20, 0x6d, - 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, - 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, - 0x5f, 0x65, 0x74, 0x61, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, - 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, - 0x65, 0x74, 0x61, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, - 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x65, 0x74, 0x61, 0x69, - 0x6c, 0x73, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x2f, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x6f, 0x6f, 0x72, 0x20, 0x6d, 0x61, - 0x6e, 0x73, 0x20, 0x6d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x20, - 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, 0x61, - 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, 0x20, 0x3d, 0x20, - 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x6d, 0x64, 0x20, 0x3d, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x2e, 0x74, 0x65, 0x78, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, - 0x2f, 0x26, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x61, 0x6d, 0x70, 0x3b, - 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, - 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x3c, 0x2f, 0x67, - 0x2c, 0x20, 0x27, 0x26, 0x6c, 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x69, 0x6e, 0x64, 0x65, + 0x78, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x69, 0x74, + 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x60, 0x70, 0x72, 0x6f, 0x62, 0x3a, 0x20, + 0x24, 0x7b, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x7d, 0x60, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, + 0x24, 0x7b, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x27, 0x30, 0x2e, + 0x33, 0x65, 0x6d, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, + 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x70, 0x2e, 0x74, 0x6f, 0x6b, + 0x5f, 0x73, 0x74, 0x72, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x43, + 0x6f, 0x6c, 0x6f, 0x72, 0x28, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x29, + 0x20, 0x3a, 0x20, 0x27, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x61, 0x72, + 0x65, 0x6e, 0x74, 0x27, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x70, 0x2e, + 0x74, 0x6f, 0x6b, 0x5f, 0x73, 0x74, 0x72, 0x7d, 0x3a, 0x20, 0x3c, 0x2f, + 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x4d, 0x61, 0x74, 0x68, + 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, 0x70, 0x2e, 0x70, 0x72, 0x6f, + 0x62, 0x20, 0x2a, 0x20, 0x31, 0x30, 0x30, 0x29, 0x7d, 0x25, 0x3c, 0x2f, + 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, + 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x7d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, + 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x60, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, + 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x7d, 0x20, + 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x7b, 0x20, 0x62, 0x61, + 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x43, 0x6f, 0x6c, 0x6f, + 0x72, 0x3a, 0x20, 0x70, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x7d, 0x7d, + 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, + 0x64, 0x72, 0x65, 0x6e, 0x3d, 0x24, 0x7b, 0x70, 0x6f, 0x70, 0x6f, 0x76, + 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x7d, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x24, 0x7b, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x6e, 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x28, 0x2f, 0x5c, 0x6e, + 0x2f, 0x67, 0x69, 0x6d, 0x29, 0x20, 0x3f, 0x20, 0x68, 0x74, 0x6d, 0x6c, + 0x60, 0x3c, 0x62, 0x72, 0x20, 0x2f, 0x3e, 0x60, 0x20, 0x3a, 0x20, 0x6d, + 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, + 0x70, 0x6f, 0x6f, 0x72, 0x20, 0x6d, 0x61, 0x6e, 0x73, 0x20, 0x6d, 0x61, + 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, + 0x63, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, + 0x6e, 0x69, 0x73, 0x68, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x64, 0x20, + 0x3d, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x74, 0x65, 0x78, + 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, + 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x26, 0x2f, 0x67, 0x2c, + 0x20, 0x27, 0x26, 0x61, 0x6d, 0x70, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 
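// [decoded] Each candidate token renders as a <div> with title=${`prob: ${p.prob}`}
// and a probColor(p.prob) background when it matches the sampled token, showing
// `${p.tok_str}: ${Math.floor(p.prob * 100)}%`; the message text itself is wrapped in
// a <${Popover}> tinted with pColor. The `-` bytes here are the old "Mirostat eta"
// field and the previous copy of the "poor mans markdown" helper, which the `+` bytes
// re-add in its new position.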
0x70, 0x6c, 0x61, - 0x63, 0x65, 0x28, 0x2f, 0x3e, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x67, + 0x63, 0x65, 0x28, 0x2f, 0x3c, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x6c, 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, - 0x23, 0x7b, 0x31, 0x2c, 0x36, 0x7d, 0x20, 0x28, 0x2e, 0x2a, 0x29, 0x24, - 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x68, 0x33, 0x3e, 0x24, - 0x31, 0x3c, 0x2f, 0x68, 0x33, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, - 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, - 0x5c, 0x2a, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, - 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, - 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, - 0x2f, 0x5f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, 0x67, - 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, - 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, - 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, - 0x3f, 0x29, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, - 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, - 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, - 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, - 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, - 0x2f, 0x60, 0x60, 0x60, 0x2e, 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, 0x5c, - 0x73, 0x5c, 0x53, 0x5d, 0x2a, 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, 0x67, - 0x2c, 0x20, 0x27, 0x3c, 0x70, 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, 0x64, - 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x3c, - 0x2f, 0x70, 0x72, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x3e, + 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x67, 0x74, 0x3b, 0x27, 0x29, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, + 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x23, 0x7b, 0x31, 0x2c, 0x36, + 0x7d, 0x20, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x69, 0x6d, 0x2c, + 0x20, 0x27, 0x3c, 0x68, 0x33, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x68, 0x33, + 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, + 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x2a, 0x5c, 0x2a, 0x2f, + 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, + 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, + 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, + 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x5f, 0x28, 0x2e, + 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, + 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, + 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, - 0x28, 0x2f, 0x60, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x60, 0x2f, 
0x67, 0x2c, - 0x20, 0x27, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, - 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, - 0x28, 0x2f, 0x5c, 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, - 0x62, 0x72, 0x20, 0x2f, 0x3e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, - 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, 0x6e, - 0x67, 0x65, 0x72, 0x6f, 0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, - 0x6e, 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, 0x7b, - 0x20, 0x5f, 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, 0x20, - 0x7d, 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, - 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, - 0x21, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x28, 0x2f, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x2a, 0x2f, + 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, + 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, + 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x2f, 0x67, 0x2c, 0x20, 0x27, + 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, + 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, + 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, 0x60, 0x60, 0x2e, + 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, 0x5c, 0x73, 0x5c, 0x53, 0x5d, 0x2a, + 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x70, + 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, + 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x3c, 0x2f, 0x70, 0x72, 0x65, 0x3e, + 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x60, 0x28, 0x2e, + 0x2a, 0x3f, 0x29, 0x60, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x63, 0x6f, + 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, + 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, + 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x6e, 0x2f, + 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x62, 0x72, 0x20, 0x2f, 0x3e, + 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x73, + 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, 0x6e, 0x67, 0x65, 0x72, 0x6f, 0x75, + 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, 0x65, 0x72, 0x48, + 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, 0x7b, 0x20, 0x5f, 0x5f, 0x68, 0x74, + 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, 0x20, 0x7d, 0x7d, 0x20, 0x2f, 0x3e, + 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, 0x6f, 0x64, + 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, + 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 
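// [decoded] Markdownish itself is reflowed rather than changed: it escapes &, < and >,
// then applies the same small regex chain before injecting the result through
// dangerouslySetInnerHTML:
//
//   .replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')   // any heading level becomes <h3>
//   .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
//   .replace(/__(.*?)__/g, '<strong>$1</strong>')
//   .replace(/\*(.*?)\*/g, '<em>$1</em>')
//   .replace(/_(.*?)_/g, '<em>$1</em>')
//   .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
//   .replace(/`(.*?)`/g, '<code>$1</code>')
//   .replace(/\n/gim, '<br />')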
0x20, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x6c, 0x6c, 0x61, 0x6d, + 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, + 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x2f, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, + 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, + 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, + 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x74, 0x6f, + 0x6b, 0x65, 0x6e, 0x5f, 0x6d, 0x73, 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, + 0x65, 0x64, 0x28, 0x29, 0x7d, 0x6d, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, + 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x2c, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, + 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, + 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, + 0x70, 0x65, 0x72, 0x5f, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x2e, 0x74, + 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x32, 0x29, 0x7d, 0x20, 0x74, + 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, 0x73, 0x65, + 0x63, 0x6f, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6d, 0x70, 0x6c, + 0x65, 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x69, 0x6d, + 0x70, 0x6c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x28, + 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, + 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x66, 0x61, 0x6c, 0x73, 0x65, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x3d, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, + 0x7b, 0x20, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x27, 0x30, 0x70, 0x78, 0x27, + 0x2c, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x3a, 0x20, 0x27, 0x30, 0x70, 0x78, + 0x27, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, + 0x52, 0x65, 0x66, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, + 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x6f, 0x70, 0x6f, + 0x76, 0x65, 0x72, 0x52, 0x65, 0x66, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, + 0x52, 0x65, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x3b, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x74, 0x6f, 0x67, 0x67, 0x6c, 0x65, 0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, + 0x72, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, + 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x2e, 
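// [decoded] ModelGenerationInfo is likewise reflowed, not changed: when
// llamaStats.value is set it renders
// `${llamaStats.value.predicted_per_token_ms.toFixed()}ms per token,
//  ${llamaStats.value.predicted_per_second.toFixed(2)} tokens per second`.
// The `+` bytes that follow begin a new "simple popover impl": a Popover component
// holding isOpen and position signals plus buttonRef/popoverRef refs.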
0x63, 0x75, + 0x72, 0x72, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x72, 0x65, 0x63, 0x74, 0x20, 0x3d, 0x20, 0x62, 0x75, 0x74, 0x74, + 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, + 0x74, 0x2e, 0x67, 0x65, 0x74, 0x42, 0x6f, 0x75, 0x6e, 0x64, 0x69, 0x6e, + 0x67, 0x43, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x63, 0x74, 0x28, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x3a, + 0x20, 0x60, 0x24, 0x7b, 0x72, 0x65, 0x63, 0x74, 0x2e, 0x62, 0x6f, 0x74, + 0x74, 0x6f, 0x6d, 0x20, 0x2b, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, + 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x59, 0x7d, 0x70, 0x78, 0x60, + 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x3a, 0x20, 0x60, 0x24, 0x7b, 0x72, + 0x65, 0x63, 0x74, 0x2e, 0x6c, 0x65, 0x66, 0x74, 0x20, 0x2b, 0x20, 0x77, + 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, + 0x58, 0x7d, 0x70, 0x78, 0x60, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x20, 0x3d, 0x20, 0x21, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, + 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, + 0x43, 0x6c, 0x69, 0x63, 0x6b, 0x4f, 0x75, 0x74, 0x73, 0x69, 0x64, 0x65, + 0x20, 0x3d, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, + 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x66, 0x20, 0x28, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x52, + 0x65, 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x26, + 0x26, 0x20, 0x21, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x52, 0x65, + 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x63, 0x6f, + 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x73, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, + 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x29, 0x20, 0x26, 0x26, 0x20, + 0x21, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x2e, 0x63, + 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, + 0x69, 0x6e, 0x73, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x74, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x73, 0x4f, 0x70, 0x65, + 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x66, 0x61, + 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, + 0x66, 0x65, 0x63, 0x74, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, + 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, 0x65, + 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x27, + 0x6d, 0x6f, 0x75, 0x73, 0x65, 0x64, 0x6f, 0x77, 0x6e, 0x27, 
0x2c, 0x20, + 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6c, 0x69, 0x63, 0x6b, 0x4f, + 0x75, 0x74, 0x73, 0x69, 0x64, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, - 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x2f, 0x3e, - 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, + 0x6e, 0x74, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x45, 0x76, 0x65, + 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x27, + 0x6d, 0x6f, 0x75, 0x73, 0x65, 0x64, 0x6f, 0x77, 0x6e, 0x27, 0x2c, 0x20, + 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6c, 0x69, 0x63, 0x6b, 0x4f, + 0x75, 0x74, 0x73, 0x69, 0x64, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x5d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x73, 0x74, 0x79, 0x6c, + 0x65, 0x3d, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x74, + 0x79, 0x6c, 0x65, 0x7d, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x62, + 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x7d, 0x20, 0x6f, 0x6e, + 0x43, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x74, 0x6f, 0x67, 0x67, + 0x6c, 0x65, 0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x7d, 0x3e, 0x24, + 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, + 0x72, 0x65, 0x6e, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x69, 0x73, + 0x4f, 0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x26, + 0x26, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x72, + 0x74, 0x61, 0x6c, 0x7d, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x3d, 0x22, 0x23, + 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, + 0x76, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x70, 0x6f, + 0x70, 0x6f, 0x76, 0x65, 0x72, 0x52, 0x65, 0x66, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x70, 0x6f, 0x70, 0x6f, 0x76, + 0x65, 0x72, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x70, 0x6f, 0x73, + 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, + 0x74, 0x6f, 0x70, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, + 0x74, 0x3a, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6c, 0x65, 0x66, 0x74, 0x2c, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 
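// [decoded] togglePopover measures the trigger element and anchors the panel at its
// bottom-left corner, compensating for page scroll:
//
//   const rect = buttonRef.current.getBoundingClientRect();
//   position.value = {
//     top: `${rect.bottom + window.scrollY}px`,
//     left: `${rect.left + window.scrollX}px`,
//   };
//
// A document-level 'mousedown' listener (registered in useEffect and removed in its
// cleanup) closes the popover on clicks outside both the trigger and the panel, and
// the panel is mounted out-of-tree via <${Portal} into="#portal">.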
0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x70, + 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, + 0x43, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, + 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, + 0x6c, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x60, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x2f, 0x2f, 0x20, 0x53, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x3a, 0x20, 0x70, + 0x72, 0x65, 0x61, 0x63, 0x74, 0x2d, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, + 0x20, 0x28, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, + 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x64, 0x65, 0x76, + 0x65, 0x6c, 0x6f, 0x70, 0x69, 0x74, 0x2f, 0x70, 0x72, 0x65, 0x61, 0x63, + 0x74, 0x2d, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x2f, 0x62, 0x6c, 0x6f, + 0x62, 0x2f, 0x6d, 0x61, 0x73, 0x74, 0x65, 0x72, 0x2f, 0x73, 0x72, 0x63, + 0x2f, 0x70, 0x72, 0x65, 0x61, 0x63, 0x74, 0x2d, 0x70, 0x6f, 0x72, 0x74, + 0x61, 0x6c, 0x2e, 0x6a, 0x73, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, + 0x2a, 0x2a, 0x20, 0x52, 0x65, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x20, + 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x6f, 0x66, + 0x20, 0x64, 0x65, 0x73, 0x63, 0x65, 0x6e, 0x64, 0x61, 0x6e, 0x74, 0x73, + 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x67, 0x69, + 0x76, 0x65, 0x6e, 0x20, 0x43, 0x53, 0x53, 0x20, 0x73, 0x65, 0x6c, 0x65, + 0x63, 0x74, 0x6f, 0x72, 0x20, 0x2a, 0x2f, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6c, 0x61, 0x73, 0x73, 0x20, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, + 0x20, 0x65, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x73, 0x20, 0x43, 0x6f, 0x6d, + 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, + 0x44, 0x69, 0x64, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x70, 0x72, + 0x6f, 0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x6c, 0x65, 0x74, 0x20, + 0x69, 0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x69, 0x66, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x5b, 0x69, 0x5d, + 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, + 0x6f, 0x70, 0x73, 0x5b, 0x69, 0x5d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x73, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, + 0x6f, 0x75, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6e, + 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, + 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, + 0x73, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x65, 0x64, 0x20, 0x3d, 
0x20, 0x74, + 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, + 0x4c, 0x61, 0x79, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, + 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, + 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, + 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, + 0x6f, 0x75, 0x6e, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, + 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x28, 0x66, 0x61, + 0x6c, 0x73, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x73, 0x4d, 0x6f, 0x75, + 0x6e, 0x74, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, + 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, + 0x65, 0x20, 0x26, 0x26, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, + 0x6d, 0x6f, 0x74, 0x65, 0x2e, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x4e, + 0x6f, 0x64, 0x65, 0x29, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, + 0x6d, 0x6f, 0x74, 0x65, 0x2e, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x4e, + 0x6f, 0x64, 0x65, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x43, 0x68, + 0x69, 0x6c, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, + 0x6f, 0x74, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x6e, + 0x64, 0x4e, 0x6f, 0x64, 0x65, 0x28, 0x6e, 0x6f, 0x64, 0x65, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, + 0x6e, 0x6f, 0x64, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x73, 0x74, + 0x72, 0x69, 0x6e, 0x67, 0x27, 0x20, 0x3f, 0x20, 0x64, 0x6f, 0x63, 0x75, + 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x71, 0x75, 0x65, 0x72, 0x79, 0x53, 0x65, + 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x28, 0x6e, 0x6f, 0x64, 0x65, 0x29, + 0x20, 0x3a, 0x20, 0x6e, 0x6f, 0x64, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x28, + 0x73, 0x68, 0x6f, 0x77, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x66, 0x20, 0x28, 0x21, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x73, 0x4d, + 0x6f, 0x75, 0x6e, 0x74, 0x65, 0x64, 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2f, 0x2f, 0x20, 0x63, 0x6c, 0x65, 0x61, 0x6e, 0x20, 0x75, 0x70, + 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x6e, 0x6f, 0x64, 0x65, 0x20, 0x69, 0x66, + 0x20, 0x6d, 0x6f, 0x76, 0x69, 0x6e, 0x67, 0x20, 0x62, 0x61, 0x73, 0x65, + 0x73, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, + 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, + 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x50, 0x6f, 
0x69, 0x6e, + 0x74, 0x65, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, + 0x74, 0x6f, 0x50, 0x6f, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x20, 0x3d, 0x20, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x69, + 0x6e, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x26, 0x26, 0x20, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x20, 0x3d, + 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x74, 0x6d, 0x6c, + 0x60, 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x50, 0x72, + 0x6f, 0x78, 0x79, 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x2c, 0x20, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x2c, 0x20, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x66, 0x69, 0x6e, 0x64, 0x4e, 0x6f, 0x64, 0x65, 0x28, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x69, 0x6e, + 0x74, 0x6f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x20, + 0x3d, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x74, 0x6d, + 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x50, 0x72, + 0x6f, 0x78, 0x79, 0x7d, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, + 0x3d, 0x24, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x78, 0x74, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x73, 0x68, 0x6f, 0x77, + 0x20, 0x26, 0x26, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, + 0x70, 0x73, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, + 0x7c, 0x7c, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x24, 0x7b, 0x50, + 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, 0x7d, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x2c, 0x20, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x2c, 0x20, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x29, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x68, 0x69, 0x67, + 0x68, 0x2d, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x63, 0x6f, 0x6d, 0x70, + 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, + 0x65, 0x6e, 0x64, 0x65, 0x72, 0x73, 0x20, 0x69, 0x74, 0x73, 0x20, 0x66, + 0x69, 0x72, 0x73, 0x74, 0x20, 0x63, 0x68, 0x69, 0x6c, 0x64, 
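// [decoded] Portal is vendored from developit/preact-portal, as the source comment in
// the bytes above notes: renderLayer() re-renders this.props.children into the node
// found by findNode(this.props.into) (a document.querySelector lookup for string
// selectors), detaching the previous remote mount when `into` changes and on unmount.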
0x20, 0x69, + 0x66, 0x20, 0x69, 0x74, 0x20, 0x65, 0x78, 0x69, 0x73, 0x74, 0x73, 0x2e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x75, 0x73, 0x65, 0x64, + 0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6e, 0x64, 0x69, 0x74, + 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, + 0x69, 0x6e, 0x67, 0x20, 0x70, 0x72, 0x6f, 0x78, 0x79, 0x2e, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x20, 0x50, 0x6f, 0x72, + 0x74, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, 0x20, 0x65, 0x78, 0x74, + 0x65, 0x6e, 0x64, 0x73, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, + 0x6e, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, + 0x65, 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x65, + 0x78, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, 0x6f, 0x6e, + 0x74, 0x65, 0x78, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, + 0x65, 0x72, 0x28, 0x7b, 0x20, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, + 0x6e, 0x20, 0x7d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x68, + 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, 0x7c, 0x7c, 0x20, 0x6e, 0x75, + 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x41, 0x70, 0x70, 0x28, + 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, - 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, - 0x65, 0x72, 0x5f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x5f, 0x6d, 0x73, 0x2e, - 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x29, 0x7d, 0x6d, 0x73, - 0x20, 0x70, 0x65, 0x72, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x2c, 0x20, - 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, - 0x63, 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x73, 0x65, 0x63, - 0x6f, 0x6e, 0x64, 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, - 0x32, 0x29, 0x7d, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x70, - 0x65, 0x72, 0x20, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x41, 0x70, 0x70, 0x28, 0x70, 0x72, - 0x6f, 0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, - 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x64, 0x69, 0x76, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, - 0x61, 0x69, 0x6e, 0x65, 0x72, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x65, 0x61, 0x64, 
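// [decoded] PortalProxy is the pass-through half of that pattern: render({ children })
// returns children || null, so Portal can re-parent children while forwarding context.
// The `+` bytes then open the rewritten App component.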
0x65, + 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x68, 0x31, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, + 0x63, 0x70, 0x70, 0x3c, 0x2f, 0x68, 0x31, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x68, 0x65, 0x61, + 0x64, 0x65, 0x72, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x69, 0x64, + 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x3c, 0x24, 0x7b, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, 0x74, + 0x65, 0x64, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20, 0x43, + 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6e, + 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, + 0x6d, 0x61, 0x69, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, 0x22, + 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, + 0x49, 0x6e, 0x70, 0x75, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x65, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x68, 0x31, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x2e, 0x63, 0x70, 0x70, 0x3c, 0x2f, 0x68, 0x31, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x68, 0x65, - 0x61, 0x64, 0x65, 0x72, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x69, - 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x22, 0x3e, + 0x20, 0x20, 0x20, 0x3c, 0x70, 0x3e, 0x3c, 0x24, 0x7b, 0x4d, 0x6f, 0x64, + 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, + 0x49, 0x6e, 0x66, 0x6f, 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x24, 0x7b, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, - 0x74, 0x65, 0x64, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3f, 0x20, - 0x43, 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3a, 0x20, 0x43, 0x6f, - 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x7d, 0x20, 0x2f, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x2f, 0x6d, 0x61, 0x69, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x77, 0x72, 0x69, 0x74, 0x65, - 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, - 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, - 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x70, 0x3e, 0x3c, 0x24, 0x7b, 0x4d, 0x6f, - 0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, - 0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x70, 0x3e, 0x50, 0x6f, 0x77, 0x65, 0x72, 0x65, 0x64, - 0x20, 0x62, 0x79, 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, - 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, - 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x67, 0x65, 0x72, - 0x67, 0x61, 0x6e, 0x6f, 0x76, 0x2f, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, - 0x63, 0x70, 0x70, 0x22, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, - 0x70, 0x70, 0x3c, 0x2f, 0x61, 0x3e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x3c, - 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, - 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x22, - 0x3e, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x3c, 0x2f, 0x61, 0x3e, - 0x2e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, - 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, - 0x70, 0x29, 0x2c, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, - 0x2e, 0x62, 0x6f, 0x64, 0x79, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, - 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 0x2f, 0x68, 0x65, - 0x61, 0x64, 0x3e, 0x0a, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, - 0x3c, 0x2f, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x0a, 0x3c, 0x2f, 0x68, - 0x74, 0x6d, 0x6c, 0x3e, 0x0a + 0x20, 0x3c, 0x70, 0x3e, 0x50, 0x6f, 0x77, 0x65, 0x72, 0x65, 0x64, 0x20, + 0x62, 0x79, 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, + 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68, + 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x67, 0x65, 0x72, 0x67, + 0x61, 0x6e, 0x6f, 0x76, 0x2f, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, + 0x70, 0x70, 0x22, 0x3e, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, + 0x70, 0x3c, 0x2f, 0x61, 0x3e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x3c, 0x61, + 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, + 0x3a, 0x2f, 0x2f, 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x22, 0x3e, + 0x67, 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x3c, 0x2f, 0x61, 0x3e, 0x2e, + 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, + 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, 0x70, + 0x29, 0x2c, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, + 0x71, 0x75, 0x65, 0x72, 0x79, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, + 0x72, 0x28, 0x27, 0x23, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, + 0x72, 0x27, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x63, + 0x72, 0x69, 0x70, 0x74, 0x3e, 0x0a, 0x3c, 
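// [decoded] App now returns a plain <div> shell (a header with <h1>llama.cpp</h1>,
// <main id="content"> switching between ChatLog and ConfigForm on chatStarted,
// <section id="write"> with MessageInput, and a footer carrying ModelGenerationInfo
// and the "Powered by llama.cpp and ggml.ai" line), and the bootstrap call changes
// from render(h(App), document.body) to render(h(App),
// document.querySelector('#container')).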
0x2f, 0x68, 0x65, 0x61, 0x64, + 0x3e, 0x0a, 0x0a, 0x3c, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x20, 0x20, + 0x3c, 0x64, 0x69, 0x76, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, + 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x22, 0x3e, 0x3c, 0x2f, 0x64, 0x69, + 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x69, 0x64, + 0x3d, 0x22, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x22, 0x3e, 0x3c, 0x2f, + 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x3c, 0x2f, 0x62, 0x6f, 0x64, 0x79, 0x3e, + 0x0a, 0x0a, 0x3c, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3e, 0x0a }; -unsigned int index_html_len = 13781; +unsigned int index_html_len = 28018; diff --git a/examples/server/index.js.hpp b/examples/server/index.js.hpp index a3b5be6d887b85409c72e2c3b891c9fb382ee0c7..c9dc078b77988653b4338a86e27037ed4d5f3dd2 100644 --- a/examples/server/index.js.hpp +++ b/examples/server/index.js.hpp @@ -4,1555 +4,1578 @@ unsigned char index_js[] = { 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x43, 0x79, 0x63, 0x6c, 0x65, 0x20, 0x64, 0x65, 0x74, 0x65, 0x63, 0x74, 0x65, 0x64, 0x22, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6e, 0x28, 0x29, 0x7b, - 0x69, 0x66, 0x28, 0x6f, 0x3e, 0x31, 0x29, 0x7b, 0x6f, 0x2d, 0x2d, 0x3b, + 0x69, 0x66, 0x28, 0x75, 0x3e, 0x31, 0x29, 0x7b, 0x75, 0x2d, 0x2d, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x7d, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x2c, 0x6e, 0x3d, 0x21, 0x31, 0x3b, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x5f, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x69, 0x3d, 0x5f, 0x3b, 0x5f, 0x3d, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x3b, 0x72, 0x2b, 0x2b, 0x3b, 0x77, 0x68, 0x69, + 0x69, 0x64, 0x20, 0x30, 0x3b, 0x66, 0x2b, 0x2b, 0x3b, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x69, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x5f, 0x3d, 0x69, 0x2e, 0x6f, 0x3b, 0x69, 0x2e, 0x6f, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x69, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, 0x3b, 0x69, 0x66, - 0x28, 0x21, 0x28, 0x38, 0x26, 0x69, 0x2e, 0x66, 0x29, 0x26, 0x26, 0x63, + 0x28, 0x21, 0x28, 0x38, 0x26, 0x69, 0x2e, 0x66, 0x29, 0x26, 0x26, 0x61, 0x28, 0x69, 0x29, 0x29, 0x74, 0x72, 0x79, 0x7b, 0x69, 0x2e, 0x63, 0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x65, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, 0x6e, 0x29, 0x7b, 0x74, 0x3d, 0x65, 0x3b, 0x6e, 0x3d, - 0x21, 0x30, 0x7d, 0x7d, 0x69, 0x3d, 0x5f, 0x7d, 0x7d, 0x72, 0x3d, 0x30, - 0x3b, 0x6f, 0x2d, 0x2d, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x29, 0x74, 0x68, + 0x21, 0x30, 0x7d, 0x7d, 0x69, 0x3d, 0x5f, 0x7d, 0x7d, 0x66, 0x3d, 0x30, + 0x3b, 0x75, 0x2d, 0x2d, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x29, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x74, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x65, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x6f, + 0x6f, 0x6e, 0x20, 0x65, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x75, 0x3e, 0x30, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x28, - 0x29, 0x3b, 0x6f, 0x2b, 0x2b, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x72, 0x65, + 0x29, 0x3b, 0x75, 0x2b, 0x2b, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x28, 0x29, 0x7d, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x6e, 0x28, 0x29, 0x7d, 0x7d, 0x6c, 0x65, - 0x74, 0x20, 0x69, 0x2c, 0x5f, 0x2c, 0x6f, 0x3d, 0x30, 0x2c, 0x72, 0x3d, - 0x30, 0x2c, 0x75, 0x3d, 0x30, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x6c, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x69, 0x29, 0x72, 0x65, - 0x74, 0x75, 0x72, 
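// [decoded] End of the regenerated index.html.hpp: the static <body> now provides the
// two mount points the script expects, <div id="container"></div> and
// <div id="portal"></div>, and index_html_len grows from 13781 to 28018 bytes. The
// hunk that follows is the regenerated index.js.hpp, i.e. the minified preact/signals
// runtime; the dense `-`/`+` pairs there appear to be mostly the same code under
// reshuffled one-letter minified names, with one visible addition decoded below.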
0x6e, 0x3b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x74, - 0x2e, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, - 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x6e, 0x2e, 0x74, 0x21, 0x3d, 0x3d, - 0x69, 0x29, 0x7b, 0x6e, 0x3d, 0x7b, 0x69, 0x3a, 0x30, 0x2c, 0x53, 0x3a, - 0x74, 0x2c, 0x70, 0x3a, 0x69, 0x2e, 0x73, 0x2c, 0x6e, 0x3a, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x2c, 0x74, 0x3a, 0x69, 0x2c, 0x65, 0x3a, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x78, 0x3a, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x2c, 0x72, 0x3a, 0x6e, 0x7d, 0x3b, 0x69, 0x66, 0x28, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x69, 0x2e, 0x73, 0x29, - 0x69, 0x2e, 0x73, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x2e, 0x73, 0x3d, - 0x6e, 0x3b, 0x74, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x33, - 0x32, 0x26, 0x69, 0x2e, 0x66, 0x29, 0x74, 0x2e, 0x53, 0x28, 0x6e, 0x29, - 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x65, 0x6c, - 0x73, 0x65, 0x20, 0x69, 0x66, 0x28, 0x2d, 0x31, 0x3d, 0x3d, 0x3d, 0x6e, - 0x2e, 0x69, 0x29, 0x7b, 0x6e, 0x2e, 0x69, 0x3d, 0x30, 0x3b, 0x69, 0x66, - 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, - 0x6e, 0x29, 0x7b, 0x6e, 0x2e, 0x6e, 0x2e, 0x70, 0x3d, 0x6e, 0x2e, 0x70, - 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, - 0x3d, 0x6e, 0x2e, 0x70, 0x29, 0x6e, 0x2e, 0x70, 0x2e, 0x6e, 0x3d, 0x6e, - 0x2e, 0x6e, 0x3b, 0x6e, 0x2e, 0x70, 0x3d, 0x69, 0x2e, 0x73, 0x3b, 0x6e, - 0x2e, 0x6e, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x69, 0x2e, - 0x73, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x2e, 0x73, 0x3d, 0x6e, 0x7d, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x7d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x66, 0x28, 0x74, 0x29, 0x7b, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x69, 0x3d, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6e, - 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x74, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x66, 0x2e, - 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x68, 0x3d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x7d, 0x3b, 0x66, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x53, 0x3d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, - 0x66, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x21, 0x3d, 0x3d, 0x74, - 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, - 0x2e, 0x65, 0x29, 0x7b, 0x74, 0x2e, 0x78, 0x3d, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x74, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, - 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x74, 0x2e, 0x65, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x74, 0x3d, 0x74, 0x7d, 0x7d, 0x3b, 0x66, 0x2e, 0x70, 0x72, - 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x55, 0x3d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, - 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x6e, 0x3d, 0x74, 0x2e, 0x65, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x78, 0x3b, + 0x74, 0x20, 0x69, 0x2c, 0x5f, 0x2c, 0x6f, 0x3d, 0x30, 0x3b, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x72, 0x28, 0x74, 0x29, 0x7b, + 0x69, 0x66, 0x28, 0x6f, 0x3e, 0x30, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x74, 
0x28, 0x29, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x6e, 0x3d, 0x69, 0x3b, 0x69, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, + 0x3b, 0x6f, 0x2b, 0x2b, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x74, 0x28, 0x29, 0x7d, 0x66, 0x69, 0x6e, 0x61, + 0x6c, 0x6c, 0x79, 0x7b, 0x6f, 0x2d, 0x2d, 0x3b, 0x69, 0x3d, 0x6e, 0x7d, + 0x7d, 0x6c, 0x65, 0x74, 0x20, 0x75, 0x3d, 0x30, 0x2c, 0x66, 0x3d, 0x30, + 0x2c, 0x6c, 0x3d, 0x30, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x73, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, 0x6f, + 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x69, 0x29, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x3b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, + 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, + 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x6e, 0x2e, 0x74, 0x21, 0x3d, 0x3d, 0x69, + 0x29, 0x7b, 0x6e, 0x3d, 0x7b, 0x69, 0x3a, 0x30, 0x2c, 0x53, 0x3a, 0x74, + 0x2c, 0x70, 0x3a, 0x69, 0x2e, 0x73, 0x2c, 0x6e, 0x3a, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x30, 0x2c, 0x74, 0x3a, 0x69, 0x2c, 0x65, 0x3a, 0x76, 0x6f, + 0x69, 0x64, 0x20, 0x30, 0x2c, 0x78, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x2c, 0x72, 0x3a, 0x6e, 0x7d, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, + 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x69, 0x2e, 0x73, 0x29, 0x69, + 0x2e, 0x73, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x2e, 0x73, 0x3d, 0x6e, + 0x3b, 0x74, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x33, 0x32, + 0x26, 0x69, 0x2e, 0x66, 0x29, 0x74, 0x2e, 0x53, 0x28, 0x6e, 0x29, 0x3b, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x65, 0x6c, 0x73, + 0x65, 0x20, 0x69, 0x66, 0x28, 0x2d, 0x31, 0x3d, 0x3d, 0x3d, 0x6e, 0x2e, + 0x69, 0x29, 0x7b, 0x6e, 0x2e, 0x69, 0x3d, 0x30, 0x3b, 0x69, 0x66, 0x28, + 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x6e, + 0x29, 0x7b, 0x6e, 0x2e, 0x6e, 0x2e, 0x70, 0x3d, 0x6e, 0x2e, 0x70, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, - 0x6e, 0x29, 0x7b, 0x6e, 0x2e, 0x78, 0x3d, 0x65, 0x3b, 0x74, 0x2e, 0x65, - 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x69, 0x66, 0x28, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x29, 0x7b, 0x65, - 0x2e, 0x65, 0x3d, 0x6e, 0x3b, 0x74, 0x2e, 0x78, 0x3d, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x7d, 0x69, 0x66, 0x28, 0x74, 0x3d, 0x3d, 0x3d, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, - 0x3d, 0x65, 0x7d, 0x7d, 0x3b, 0x66, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, - 0x74, 0x79, 0x70, 0x65, 0x2e, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, - 0x62, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, - 0x68, 0x69, 0x73, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x62, - 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, - 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x6e, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x69, 0x3d, 0x33, 0x32, 0x26, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x66, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, - 0x3d, 0x2d, 0x33, 0x33, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x74, 0x28, 0x65, - 0x29, 0x7d, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x69, 0x7d, 0x7d, 0x29, 0x29, 0x7d, - 0x3b, 0x66, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x4f, 0x66, 0x3d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 
0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x7d, 0x3b, 0x66, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, - 0x70, 0x65, 0x2e, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2b, 0x22, 0x22, 0x7d, 0x3b, 0x66, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x74, 0x6f, 0x4a, - 0x53, 0x4f, 0x4e, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3b, 0x66, 0x2e, - 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x70, 0x65, - 0x65, 0x6b, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, + 0x6e, 0x2e, 0x70, 0x29, 0x6e, 0x2e, 0x70, 0x2e, 0x6e, 0x3d, 0x6e, 0x2e, + 0x6e, 0x3b, 0x6e, 0x2e, 0x70, 0x3d, 0x69, 0x2e, 0x73, 0x3b, 0x6e, 0x2e, + 0x6e, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x69, 0x2e, 0x73, + 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x2e, 0x73, 0x3d, 0x6e, 0x7d, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x7d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x63, 0x28, 0x74, 0x29, 0x7b, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x69, 0x3d, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6e, 0x3d, + 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x74, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x63, 0x2e, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x68, 0x3d, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x7d, 0x3b, 0x63, 0x2e, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x53, 0x3d, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, + 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x21, 0x3d, 0x3d, 0x74, 0x26, + 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x2e, + 0x65, 0x29, 0x7b, 0x74, 0x2e, 0x78, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x74, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, + 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x74, 0x2e, 0x65, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x74, 0x3d, 0x74, 0x7d, 0x7d, 0x3b, 0x63, 0x2e, 0x70, 0x72, 0x6f, + 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x55, 0x3d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, + 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, + 0x3d, 0x74, 0x2e, 0x65, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x78, 0x3b, 0x69, + 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, + 0x29, 0x7b, 0x6e, 0x2e, 0x78, 0x3d, 0x65, 0x3b, 0x74, 0x2e, 0x65, 0x3d, + 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x69, 0x66, 0x28, 0x76, 0x6f, + 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x29, 0x7b, 0x65, 0x2e, + 0x65, 0x3d, 0x6e, 0x3b, 0x74, 0x2e, 0x78, 0x3d, 0x76, 0x6f, 0x69, 0x64, + 0x20, 0x30, 0x7d, 0x69, 0x66, 0x28, 0x74, 0x3d, 0x3d, 0x3d, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x74, 0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x3d, + 0x65, 0x7d, 0x7d, 0x3b, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, + 0x79, 0x70, 0x65, 0x2e, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x62, + 0x65, 0x3d, 0x66, 
0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, + 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x68, + 0x69, 0x73, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x53, 0x28, + 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x6e, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2c, 0x69, 0x3d, 0x33, 0x32, 0x26, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x66, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, + 0x2d, 0x33, 0x33, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x74, 0x28, 0x65, 0x29, + 0x7d, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x69, 0x7d, 0x7d, 0x29, 0x29, 0x7d, 0x3b, + 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x4f, 0x66, 0x3d, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x7d, 0x3b, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, + 0x65, 0x2e, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3d, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x2b, 0x22, 0x22, 0x7d, 0x3b, 0x63, 0x2e, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x74, 0x6f, 0x4a, 0x53, + 0x4f, 0x4e, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x76, 0x7d, 0x3b, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, - 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, - 0x74, 0x79, 0x28, 0x66, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, - 0x70, 0x65, 0x2c, 0x22, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x2c, 0x7b, - 0x67, 0x65, 0x74, 0x28, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x74, 0x3d, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x69, 0x66, - 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x29, - 0x74, 0x2e, 0x69, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x3b, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, - 0x7d, 0x2c, 0x73, 0x65, 0x74, 0x28, 0x65, 0x29, 0x7b, 0x69, 0x66, 0x28, - 0x69, 0x20, 0x69, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x6f, 0x66, - 0x20, 0x70, 0x29, 0x21, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x29, 0x7b, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, - 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x43, 0x6f, 0x6d, 0x70, - 0x75, 0x74, 0x65, 0x64, 0x20, 0x63, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x20, - 0x68, 0x61, 0x76, 0x65, 0x20, 0x73, 0x69, 0x64, 0x65, 0x2d, 0x65, 0x66, - 0x66, 0x65, 0x63, 0x74, 0x73, 0x22, 0x29, 0x7d, 0x28, 0x29, 0x3b, 0x69, - 0x66, 0x28, 0x65, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, - 0x29, 0x7b, 0x69, 0x66, 0x28, 0x72, 0x3e, 0x31, 0x30, 0x30, 0x29, 0x74, - 0x28, 0x29, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x65, 0x3b, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x2b, 0x2b, 0x3b, 0x75, 0x2b, 0x2b, - 0x3b, 0x6f, 0x2b, 0x2b, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x66, 0x6f, 0x72, - 0x28, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x74, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, - 0x3b, 0x74, 0x3d, 0x74, 0x2e, 0x78, 0x29, 0x74, 0x2e, 0x74, 0x2e, 0x4e, - 0x28, 0x29, 0x7d, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x6e, - 0x28, 0x29, 0x7d, 
0x7d, 0x7d, 0x7d, 0x29, 0x3b, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x73, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x66, 0x28, 0x74, - 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x63, - 0x28, 0x74, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, - 0x6e, 0x3d, 0x74, 0x2e, 0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, - 0x21, 0x3d, 0x3d, 0x6e, 0x3b, 0x6e, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, 0x69, - 0x66, 0x28, 0x6e, 0x2e, 0x53, 0x2e, 0x69, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, - 0x69, 0x7c, 0x7c, 0x21, 0x6e, 0x2e, 0x53, 0x2e, 0x68, 0x28, 0x29, 0x7c, - 0x7c, 0x6e, 0x2e, 0x53, 0x2e, 0x69, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x69, - 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x21, 0x31, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x66, 0x6f, 0x72, - 0x28, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x73, 0x3b, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x3b, 0x6e, 0x3d, - 0x6e, 0x2e, 0x6e, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, - 0x3d, 0x6e, 0x2e, 0x53, 0x2e, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x29, 0x6e, 0x2e, 0x72, - 0x3d, 0x65, 0x3b, 0x6e, 0x2e, 0x53, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x6e, - 0x2e, 0x69, 0x3d, 0x2d, 0x31, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, 0x7b, 0x74, - 0x2e, 0x73, 0x3d, 0x6e, 0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x7d, 0x7d, - 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x61, 0x28, - 0x74, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x2c, 0x65, 0x3d, 0x74, - 0x2e, 0x73, 0x3b, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x28, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x29, 0x7b, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x74, 0x3d, 0x65, 0x2e, 0x70, 0x3b, 0x69, 0x66, 0x28, - 0x2d, 0x31, 0x3d, 0x3d, 0x3d, 0x65, 0x2e, 0x69, 0x29, 0x7b, 0x65, 0x2e, - 0x53, 0x2e, 0x55, 0x28, 0x65, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x29, 0x74, 0x2e, 0x6e, - 0x3d, 0x65, 0x2e, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x2e, 0x6e, 0x29, 0x65, 0x2e, 0x6e, - 0x2e, 0x70, 0x3d, 0x74, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x6e, 0x3d, - 0x65, 0x3b, 0x65, 0x2e, 0x53, 0x2e, 0x6e, 0x3d, 0x65, 0x2e, 0x72, 0x3b, - 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, - 0x65, 0x2e, 0x72, 0x29, 0x65, 0x2e, 0x72, 0x3d, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x3b, 0x65, 0x3d, 0x74, 0x7d, 0x74, 0x2e, 0x73, 0x3d, 0x6e, - 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x70, 0x28, - 0x74, 0x29, 0x7b, 0x66, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, - 0x69, 0x73, 0x2c, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x29, 0x3b, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x78, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x73, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x67, 0x3d, 0x75, 0x2d, 0x31, 0x3b, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x66, 0x3d, 0x34, 0x7d, 0x28, 0x70, 0x2e, 0x70, 0x72, 0x6f, - 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x6e, 0x65, 0x77, 0x20, 0x66, - 0x29, 0x2e, 0x68, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, - 0x33, 0x3b, 0x69, 0x66, 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x66, 0x29, 0x72, 
0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x31, 0x3b, 0x69, - 0x66, 0x28, 0x33, 0x32, 0x3d, 0x3d, 0x28, 0x33, 0x36, 0x26, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x66, 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x21, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, - 0x35, 0x3b, 0x69, 0x66, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, 0x3d, - 0x3d, 0x3d, 0x75, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, - 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, 0x3d, 0x75, 0x3b, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x31, 0x3b, 0x69, 0x66, 0x28, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x69, 0x3e, 0x30, 0x26, 0x26, 0x21, 0x63, 0x28, - 0x74, 0x68, 0x69, 0x73, 0x29, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x21, 0x30, 0x7d, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x3d, 0x69, - 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x68, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, - 0x3b, 0x69, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x3b, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x78, 0x28, 0x29, - 0x3b, 0x69, 0x66, 0x28, 0x31, 0x36, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x66, 0x7c, 0x7c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x21, 0x3d, 0x3d, - 0x74, 0x7c, 0x7c, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x69, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x74, 0x3b, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x31, 0x37, 0x3b, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x2b, 0x2b, 0x7d, 0x7d, 0x63, 0x61, - 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x76, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, - 0x31, 0x36, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x2b, 0x2b, 0x7d, - 0x69, 0x3d, 0x74, 0x3b, 0x61, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x7d, 0x3b, 0x70, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x53, 0x3d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, - 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x66, 0x7c, 0x3d, 0x33, 0x36, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, - 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x3b, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x3b, 0x74, 0x3d, - 0x74, 0x2e, 0x6e, 0x29, 0x74, 0x2e, 0x53, 0x2e, 0x53, 0x28, 0x74, 0x29, - 0x7d, 0x66, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, - 0x2e, 0x53, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, - 0x2c, 0x74, 0x29, 0x7d, 0x3b, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, - 0x74, 0x79, 0x70, 0x65, 0x2e, 0x55, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x74, 0x29, 0x7b, 0x66, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, - 0x70, 0x65, 0x2e, 0x55, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, - 0x69, 0x73, 0x2c, 0x74, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, - 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, - 0x33, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, - 0x30, 0x21, 0x3d, 
0x3d, 0x74, 0x3b, 0x74, 0x3d, 0x74, 0x2e, 0x6e, 0x29, - 0x74, 0x2e, 0x53, 0x2e, 0x55, 0x28, 0x74, 0x29, 0x7d, 0x7d, 0x7d, 0x3b, - 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, - 0x4e, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, - 0x7b, 0x69, 0x66, 0x28, 0x21, 0x28, 0x32, 0x26, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x66, 0x29, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, - 0x3d, 0x36, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x74, - 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x3b, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x3b, 0x74, 0x3d, 0x74, 0x2e, 0x78, - 0x29, 0x74, 0x2e, 0x74, 0x2e, 0x4e, 0x28, 0x29, 0x7d, 0x7d, 0x3b, 0x70, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x70, - 0x65, 0x65, 0x6b, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x68, 0x28, 0x29, 0x29, 0x74, 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x31, - 0x36, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x74, 0x68, 0x72, - 0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3b, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x7d, - 0x3b, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x69, - 0x6e, 0x65, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x28, 0x70, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x22, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x2c, 0x7b, 0x67, 0x65, 0x74, 0x28, - 0x29, 0x7b, 0x69, 0x66, 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x66, 0x29, 0x74, 0x28, 0x29, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x6e, 0x3d, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x68, 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x29, 0x6e, 0x2e, 0x69, - 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x3b, 0x69, 0x66, 0x28, 0x31, - 0x36, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x74, 0x68, 0x72, - 0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3b, 0x72, 0x65, + 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3b, 0x63, 0x2e, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x70, 0x65, 0x65, + 0x6b, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, + 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x76, 0x7d, 0x3b, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x64, + 0x65, 0x66, 0x69, 0x6e, 0x65, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, + 0x79, 0x28, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, + 0x65, 0x2c, 0x22, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x2c, 0x7b, 0x67, + 0x65, 0x74, 0x28, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, + 0x3d, 0x73, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x69, 0x66, 0x28, + 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x29, 0x74, + 0x2e, 0x69, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x7d, - 0x7d, 0x29, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x64, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, - 0x6e, 0x65, 0x77, 0x20, 0x70, 0x28, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x76, 0x28, 0x74, 0x29, 0x7b, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x74, 0x2e, 0x75, 0x3b, 0x74, - 0x2e, 0x75, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x69, 0x66, - 0x28, 0x22, 0x66, 
0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, - 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x65, 0x29, 0x7b, 0x6f, - 0x2b, 0x2b, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x5f, 0x3d, 0x69, - 0x3b, 0x69, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x72, - 0x79, 0x7b, 0x65, 0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, - 0x6e, 0x29, 0x7b, 0x74, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x74, - 0x2e, 0x66, 0x7c, 0x3d, 0x38, 0x3b, 0x79, 0x28, 0x74, 0x29, 0x3b, 0x74, - 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x7d, 0x66, 0x69, 0x6e, 0x61, 0x6c, - 0x6c, 0x79, 0x7b, 0x69, 0x3d, 0x5f, 0x3b, 0x6e, 0x28, 0x29, 0x7d, 0x7d, - 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x79, 0x28, + 0x2c, 0x73, 0x65, 0x74, 0x28, 0x65, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x69, + 0x20, 0x69, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x6f, 0x66, 0x20, + 0x76, 0x29, 0x21, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, + 0x29, 0x7b, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, + 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x43, 0x6f, 0x6d, 0x70, 0x75, + 0x74, 0x65, 0x64, 0x20, 0x63, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x20, 0x68, + 0x61, 0x76, 0x65, 0x20, 0x73, 0x69, 0x64, 0x65, 0x2d, 0x65, 0x66, 0x66, + 0x65, 0x63, 0x74, 0x73, 0x22, 0x29, 0x7d, 0x28, 0x29, 0x3b, 0x69, 0x66, + 0x28, 0x65, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x29, + 0x7b, 0x69, 0x66, 0x28, 0x66, 0x3e, 0x31, 0x30, 0x30, 0x29, 0x74, 0x28, + 0x29, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x65, 0x3b, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x69, 0x2b, 0x2b, 0x3b, 0x6c, 0x2b, 0x2b, 0x3b, + 0x75, 0x2b, 0x2b, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x66, 0x6f, 0x72, 0x28, + 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, + 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x3b, + 0x74, 0x3d, 0x74, 0x2e, 0x78, 0x29, 0x74, 0x2e, 0x74, 0x2e, 0x4e, 0x28, + 0x29, 0x7d, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x6e, 0x28, + 0x29, 0x7d, 0x7d, 0x7d, 0x7d, 0x29, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x63, 0x28, 0x74, 0x29, + 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x61, 0x28, 0x74, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, - 0x3d, 0x3d, 0x6e, 0x3b, 0x6e, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, 0x6e, 0x2e, - 0x53, 0x2e, 0x55, 0x28, 0x6e, 0x29, 0x3b, 0x74, 0x2e, 0x78, 0x3d, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x2e, 0x73, 0x3d, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x3b, 0x76, 0x28, 0x74, 0x29, 0x7d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6d, 0x28, 0x74, 0x29, 0x7b, - 0x69, 0x66, 0x28, 0x69, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x29, - 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, - 0x72, 0x6f, 0x72, 0x28, 0x22, 0x4f, 0x75, 0x74, 0x2d, 0x6f, 0x66, 0x2d, - 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74, - 0x22, 0x29, 0x3b, 0x61, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x69, - 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, - 0x32, 0x3b, 0x69, 0x66, 0x28, 0x38, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x66, 0x29, 0x79, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x6e, 0x28, - 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x67, - 0x28, 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x78, 0x3d, 0x74, - 0x3b, 0x74, 0x68, 0x69, 
0x73, 0x2e, 0x75, 0x3d, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x3d, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6f, 0x3d, - 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x66, 0x3d, 0x33, 0x32, 0x7d, 0x67, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, - 0x74, 0x79, 0x70, 0x65, 0x2e, 0x63, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x53, 0x28, 0x29, 0x3b, 0x74, - 0x72, 0x79, 0x7b, 0x69, 0x66, 0x28, 0x38, 0x26, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x66, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x69, 0x66, + 0x3d, 0x3d, 0x6e, 0x3b, 0x6e, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, 0x69, 0x66, + 0x28, 0x6e, 0x2e, 0x53, 0x2e, 0x69, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x69, + 0x7c, 0x7c, 0x21, 0x6e, 0x2e, 0x53, 0x2e, 0x68, 0x28, 0x29, 0x7c, 0x7c, + 0x6e, 0x2e, 0x53, 0x2e, 0x69, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x69, 0x29, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x21, 0x31, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x20, 0x70, 0x28, 0x74, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, + 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x73, 0x3b, 0x76, 0x6f, + 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x3b, 0x6e, 0x3d, 0x6e, + 0x2e, 0x6e, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, + 0x6e, 0x2e, 0x53, 0x2e, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x29, 0x6e, 0x2e, 0x72, 0x3d, + 0x65, 0x3b, 0x6e, 0x2e, 0x53, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x6e, 0x2e, + 0x69, 0x3d, 0x2d, 0x31, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, + 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, 0x7b, 0x74, 0x2e, + 0x73, 0x3d, 0x6e, 0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x7d, 0x7d, 0x7d, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x64, 0x28, 0x74, + 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x2c, 0x65, 0x3d, 0x74, 0x2e, + 0x73, 0x3b, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x28, 0x76, 0x6f, 0x69, 0x64, + 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x74, 0x3d, 0x65, 0x2e, 0x70, 0x3b, 0x69, 0x66, 0x28, 0x2d, + 0x31, 0x3d, 0x3d, 0x3d, 0x65, 0x2e, 0x69, 0x29, 0x7b, 0x65, 0x2e, 0x53, + 0x2e, 0x55, 0x28, 0x65, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x29, 0x74, 0x2e, 0x6e, 0x3d, + 0x65, 0x2e, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x2e, 0x6e, 0x29, 0x65, 0x2e, 0x6e, 0x2e, + 0x70, 0x3d, 0x74, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x6e, 0x3d, 0x65, + 0x3b, 0x65, 0x2e, 0x53, 0x2e, 0x6e, 0x3d, 0x65, 0x2e, 0x72, 0x3b, 0x69, + 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, + 0x2e, 0x72, 0x29, 0x65, 0x2e, 0x72, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x3b, 0x65, 0x3d, 0x74, 0x7d, 0x74, 0x2e, 0x73, 0x3d, 0x6e, 0x7d, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x76, 0x28, 0x74, + 0x29, 0x7b, 0x63, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, + 0x73, 0x2c, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x29, 0x3b, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x78, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x73, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x67, 0x3d, 0x6c, 0x2d, 0x31, 0x3b, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x66, 0x3d, 0x34, 0x7d, 0x28, 0x76, 0x2e, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x74, 0x79, 0x70, 
0x65, 0x3d, 0x6e, 0x65, 0x77, 0x20, 0x63, 0x29, + 0x2e, 0x68, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, + 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, + 0x3b, 0x69, 0x66, 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, + 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x31, 0x3b, 0x69, 0x66, + 0x28, 0x33, 0x32, 0x3d, 0x3d, 0x28, 0x33, 0x36, 0x26, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x66, 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, + 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x35, + 0x3b, 0x69, 0x66, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, 0x3d, 0x3d, + 0x3d, 0x6c, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, 0x3d, 0x6c, 0x3b, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x31, 0x3b, 0x69, 0x66, 0x28, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x69, 0x3e, 0x30, 0x26, 0x26, 0x21, 0x61, 0x28, 0x74, + 0x68, 0x69, 0x73, 0x29, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, + 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, + 0x30, 0x7d, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x3d, 0x69, 0x3b, + 0x74, 0x72, 0x79, 0x7b, 0x70, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, + 0x69, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x78, 0x28, 0x29, 0x3b, + 0x69, 0x66, 0x28, 0x31, 0x36, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, + 0x7c, 0x7c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x21, 0x3d, 0x3d, 0x74, + 0x7c, 0x7c, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, + 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x74, 0x3b, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x31, 0x37, 0x3b, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x69, 0x2b, 0x2b, 0x7d, 0x7d, 0x63, 0x61, 0x74, + 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, + 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x31, + 0x36, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x2b, 0x2b, 0x7d, 0x69, + 0x3d, 0x74, 0x3b, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x7d, 0x3b, 0x76, 0x2e, 0x70, 0x72, + 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x53, 0x3d, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x78, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x78, 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x22, 0x66, 0x75, 0x6e, + 0x69, 0x73, 0x2e, 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, + 0x7c, 0x3d, 0x33, 0x36, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, + 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x3b, 0x76, 0x6f, + 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x3b, 0x74, 0x3d, 0x74, + 0x2e, 0x6e, 0x29, 0x74, 0x2e, 0x53, 0x2e, 0x53, 0x28, 0x74, 0x29, 0x7d, + 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, + 0x53, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, + 0x74, 0x29, 0x7d, 0x3b, 0x76, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, + 0x79, 0x70, 0x65, 0x2e, 0x55, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, + 0x29, 0x7b, 0x63, 0x2e, 
0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, + 0x65, 0x2e, 0x55, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, + 0x73, 0x2c, 0x74, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, + 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, + 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, 0x33, + 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, + 0x21, 0x3d, 0x3d, 0x74, 0x3b, 0x74, 0x3d, 0x74, 0x2e, 0x6e, 0x29, 0x74, + 0x2e, 0x53, 0x2e, 0x55, 0x28, 0x74, 0x29, 0x7d, 0x7d, 0x7d, 0x3b, 0x76, + 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x4e, + 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, + 0x69, 0x66, 0x28, 0x21, 0x28, 0x32, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x66, 0x29, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, + 0x36, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x3b, 0x74, 0x3d, 0x74, 0x2e, 0x78, 0x29, + 0x74, 0x2e, 0x74, 0x2e, 0x4e, 0x28, 0x29, 0x7d, 0x7d, 0x3b, 0x76, 0x2e, + 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x70, 0x65, + 0x65, 0x6b, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, + 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x68, + 0x28, 0x29, 0x29, 0x74, 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x31, 0x36, + 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x74, 0x68, 0x72, 0x6f, + 0x77, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3b, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x7d, 0x3b, + 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x69, 0x6e, + 0x65, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x28, 0x76, 0x2e, + 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x22, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x22, 0x2c, 0x7b, 0x67, 0x65, 0x74, 0x28, 0x29, + 0x7b, 0x69, 0x66, 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, + 0x29, 0x74, 0x28, 0x29, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, + 0x3d, 0x73, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x68, 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x29, 0x6e, 0x2e, 0x69, 0x3d, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x3b, 0x69, 0x66, 0x28, 0x31, 0x36, + 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x74, 0x68, 0x72, 0x6f, + 0x77, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3b, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x7d, 0x7d, + 0x29, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x79, + 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, + 0x65, 0x77, 0x20, 0x76, 0x28, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6d, 0x28, 0x74, 0x29, 0x7b, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x74, 0x2e, 0x75, 0x3b, 0x74, 0x2e, + 0x75, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x69, 0x66, 0x28, + 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, + 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x65, 0x29, 0x7b, 0x75, 0x2b, + 0x2b, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x5f, 0x3d, 0x69, 0x3b, + 0x69, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x72, 0x79, + 0x7b, 0x65, 0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x6e, + 0x29, 0x7b, 0x74, 0x2e, 
0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x74, 0x2e, + 0x66, 0x7c, 0x3d, 0x38, 0x3b, 0x67, 0x28, 0x74, 0x29, 0x3b, 0x74, 0x68, + 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x7d, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, + 0x79, 0x7b, 0x69, 0x3d, 0x5f, 0x3b, 0x6e, 0x28, 0x29, 0x7d, 0x7d, 0x7d, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x67, 0x28, 0x74, + 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, + 0x74, 0x2e, 0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, + 0x3d, 0x6e, 0x3b, 0x6e, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, 0x6e, 0x2e, 0x53, + 0x2e, 0x55, 0x28, 0x6e, 0x29, 0x3b, 0x74, 0x2e, 0x78, 0x3d, 0x76, 0x6f, + 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x2e, 0x73, 0x3d, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x30, 0x3b, 0x6d, 0x28, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x62, 0x28, 0x74, 0x29, 0x7b, 0x69, + 0x66, 0x28, 0x69, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x29, 0x74, + 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, + 0x6f, 0x72, 0x28, 0x22, 0x4f, 0x75, 0x74, 0x2d, 0x6f, 0x66, 0x2d, 0x6f, + 0x72, 0x64, 0x65, 0x72, 0x20, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74, 0x22, + 0x29, 0x3b, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x69, 0x3d, + 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x32, + 0x3b, 0x69, 0x66, 0x28, 0x38, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, + 0x29, 0x67, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x6e, 0x28, 0x29, + 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6b, 0x28, + 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x78, 0x3d, 0x74, 0x3b, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x75, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x3d, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6f, 0x3d, 0x76, + 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, + 0x3d, 0x33, 0x32, 0x7d, 0x6b, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, + 0x79, 0x70, 0x65, 0x2e, 0x63, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, + 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x53, 0x28, 0x29, 0x3b, 0x74, 0x72, + 0x79, 0x7b, 0x69, 0x66, 0x28, 0x38, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x66, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x69, 0x66, 0x28, + 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x78, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x78, 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x22, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, + 0x66, 0x20, 0x6e, 0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x75, 0x3d, 0x6e, + 0x7d, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x74, 0x28, 0x29, + 0x7d, 0x7d, 0x3b, 0x6b, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, + 0x70, 0x65, 0x2e, 0x53, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x28, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x66, 0x29, 0x74, 0x28, 0x29, 0x3b, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x66, 0x7c, 0x3d, 0x31, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, + 0x26, 0x3d, 0x2d, 0x39, 0x3b, 0x6d, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, + 0x3b, 0x70, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x75, 0x2b, 0x2b, + 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x69, 0x3b, 0x69, + 0x3d, 0x74, 0x68, 0x69, 0x73, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x20, 0x62, 0x2e, 0x62, 
0x69, 0x6e, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, + 0x2c, 0x6e, 0x29, 0x7d, 0x3b, 0x6b, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, + 0x74, 0x79, 0x70, 0x65, 0x2e, 0x4e, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, 0x28, 0x32, + 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x29, 0x7b, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x32, 0x3b, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x6f, 0x3d, 0x5f, 0x3b, 0x5f, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x7d, + 0x7d, 0x3b, 0x6b, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, + 0x65, 0x2e, 0x64, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x28, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x38, + 0x3b, 0x69, 0x66, 0x28, 0x21, 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x66, 0x29, 0x29, 0x67, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x7d, + 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x53, 0x28, + 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x6e, + 0x65, 0x77, 0x20, 0x6b, 0x28, 0x74, 0x29, 0x3b, 0x74, 0x72, 0x79, 0x7b, + 0x6e, 0x2e, 0x63, 0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, + 0x74, 0x29, 0x7b, 0x6e, 0x2e, 0x64, 0x28, 0x29, 0x3b, 0x74, 0x68, 0x72, + 0x6f, 0x77, 0x20, 0x74, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x6e, 0x2e, 0x64, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x6e, 0x29, 0x7d, + 0x76, 0x61, 0x72, 0x20, 0x78, 0x2c, 0x77, 0x2c, 0x43, 0x2c, 0x45, 0x2c, + 0x55, 0x2c, 0x48, 0x2c, 0x4e, 0x2c, 0x50, 0x2c, 0x24, 0x2c, 0x44, 0x3d, + 0x7b, 0x7d, 0x2c, 0x54, 0x3d, 0x5b, 0x5d, 0x2c, 0x56, 0x3d, 0x2f, 0x61, + 0x63, 0x69, 0x74, 0x7c, 0x65, 0x78, 0x28, 0x3f, 0x3a, 0x73, 0x7c, 0x67, + 0x7c, 0x6e, 0x7c, 0x70, 0x7c, 0x24, 0x29, 0x7c, 0x72, 0x70, 0x68, 0x7c, + 0x67, 0x72, 0x69, 0x64, 0x7c, 0x6f, 0x77, 0x73, 0x7c, 0x6d, 0x6e, 0x63, + 0x7c, 0x6e, 0x74, 0x77, 0x7c, 0x69, 0x6e, 0x65, 0x5b, 0x63, 0x68, 0x5d, + 0x7c, 0x7a, 0x6f, 0x6f, 0x7c, 0x5e, 0x6f, 0x72, 0x64, 0x7c, 0x69, 0x74, + 0x65, 0x72, 0x61, 0x2f, 0x69, 0x2c, 0x41, 0x3d, 0x41, 0x72, 0x72, 0x61, + 0x79, 0x2e, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x3b, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x46, 0x28, 0x74, 0x2c, 0x6e, + 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x65, 0x20, + 0x69, 0x6e, 0x20, 0x6e, 0x29, 0x74, 0x5b, 0x65, 0x5d, 0x3d, 0x6e, 0x5b, + 0x65, 0x5d, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x7d, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4d, 0x28, 0x74, + 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x70, 0x61, + 0x72, 0x65, 0x6e, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x3b, 0x6e, 0x26, 0x26, + 0x6e, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x43, 0x68, 0x69, 0x6c, + 0x64, 0x28, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x57, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, + 0x61, 0x72, 0x20, 0x69, 0x2c, 0x5f, 0x2c, 0x6f, 0x2c, 0x72, 0x3d, 0x7b, + 0x7d, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6f, 0x20, 0x69, 0x6e, 0x20, 0x6e, + 0x29, 0x22, 0x6b, 0x65, 0x79, 0x22, 0x3d, 0x3d, 0x6f, 0x3f, 0x69, 0x3d, + 0x6e, 0x5b, 0x6f, 0x5d, 0x3a, 0x22, 0x72, 0x65, 0x66, 0x22, 0x3d, 0x3d, + 0x6f, 0x3f, 0x5f, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3a, 0x72, 0x5b, 0x6f, + 0x5d, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3b, 0x69, 0x66, 0x28, 0x61, 0x72, + 0x67, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, + 0x74, 0x68, 0x3e, 0x32, 0x26, 0x26, 0x28, 0x72, 0x2e, 0x63, 0x68, 0x69, + 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x3d, 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65, + 0x6e, 0x74, 0x73, 0x2e, 
0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x33, + 0x3f, 0x78, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x61, 0x72, 0x67, 0x75, + 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2c, 0x32, 0x29, 0x3a, 0x65, 0x29, 0x2c, + 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, + 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x26, 0x26, 0x6e, 0x75, + 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, + 0x74, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x6f, + 0x20, 0x69, 0x6e, 0x20, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, + 0x74, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x3d, 0x3d, 0x3d, 0x72, 0x5b, 0x6f, 0x5d, 0x26, 0x26, 0x28, 0x72, + 0x5b, 0x6f, 0x5d, 0x3d, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, + 0x74, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x5b, 0x6f, 0x5d, 0x29, 0x3b, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x4f, 0x28, 0x74, 0x2c, 0x72, 0x2c, + 0x69, 0x2c, 0x5f, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4f, 0x28, 0x74, 0x2c, 0x6e, + 0x2c, 0x65, 0x2c, 0x69, 0x2c, 0x5f, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, + 0x6f, 0x3d, 0x7b, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x74, 0x2c, 0x70, 0x72, + 0x6f, 0x70, 0x73, 0x3a, 0x6e, 0x2c, 0x6b, 0x65, 0x79, 0x3a, 0x65, 0x2c, + 0x72, 0x65, 0x66, 0x3a, 0x69, 0x2c, 0x5f, 0x5f, 0x6b, 0x3a, 0x6e, 0x75, + 0x6c, 0x6c, 0x2c, 0x5f, 0x5f, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x5f, + 0x5f, 0x62, 0x3a, 0x30, 0x2c, 0x5f, 0x5f, 0x65, 0x3a, 0x6e, 0x75, 0x6c, + 0x6c, 0x2c, 0x5f, 0x5f, 0x64, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, + 0x2c, 0x5f, 0x5f, 0x63, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x5f, 0x5f, + 0x68, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x2c, 0x5f, 0x5f, 0x76, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, + 0x5f, 0x3f, 0x2b, 0x2b, 0x43, 0x3a, 0x5f, 0x7d, 0x3b, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x5f, 0x26, + 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x77, 0x2e, 0x76, 0x6e, 0x6f, + 0x64, 0x65, 0x26, 0x26, 0x77, 0x2e, 0x76, 0x6e, 0x6f, 0x64, 0x65, 0x28, + 0x6f, 0x29, 0x2c, 0x6f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x4c, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x7b, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3a, 0x6e, 0x75, 0x6c, + 0x6c, 0x7d, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, + 0x52, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x74, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x7d, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x49, 0x28, 0x74, 0x2c, + 0x6e, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, + 0x73, 0x3d, 0x74, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x63, 0x6f, 0x6e, + 0x74, 0x65, 0x78, 0x74, 0x3d, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x6a, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x69, + 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x6e, 0x29, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x3f, 0x6a, 0x28, + 0x74, 0x2e, 0x5f, 0x5f, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x2e, 0x5f, 0x5f, + 0x6b, 0x2e, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x4f, 0x66, 0x28, 0x74, 0x29, + 0x2b, 0x31, 0x29, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x66, 0x6f, 0x72, + 0x28, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3b, 0x6e, 0x3c, 0x74, 0x2e, 0x5f, + 0x5f, 0x6b, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x6e, 0x2b, + 0x2b, 0x29, 0x69, 0x66, 
0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28, + 0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x5b, 0x6e, 0x5d, 0x29, 0x26, + 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x65, + 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x2e, 0x5f, 0x5f, + 0x65, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, - 0x6f, 0x66, 0x20, 0x6e, 0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x75, 0x3d, - 0x6e, 0x7d, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x74, 0x28, - 0x29, 0x7d, 0x7d, 0x3b, 0x67, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, - 0x79, 0x70, 0x65, 0x2e, 0x53, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x31, 0x26, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x66, 0x29, 0x74, 0x28, 0x29, 0x3b, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x31, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x66, 0x26, 0x3d, 0x2d, 0x39, 0x3b, 0x76, 0x28, 0x74, 0x68, 0x69, 0x73, - 0x29, 0x3b, 0x68, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x6f, 0x2b, - 0x2b, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x69, 0x3b, - 0x69, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x6d, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x74, 0x68, 0x69, - 0x73, 0x2c, 0x6e, 0x29, 0x7d, 0x3b, 0x67, 0x2e, 0x70, 0x72, 0x6f, 0x74, - 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x4e, 0x3d, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, 0x28, - 0x32, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x29, 0x7b, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x32, 0x3b, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x6f, 0x3d, 0x5f, 0x3b, 0x5f, 0x3d, 0x74, 0x68, 0x69, 0x73, - 0x7d, 0x7d, 0x3b, 0x67, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, - 0x70, 0x65, 0x2e, 0x64, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x28, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, - 0x38, 0x3b, 0x69, 0x66, 0x28, 0x21, 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x66, 0x29, 0x29, 0x79, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, - 0x7d, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x62, - 0x28, 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, - 0x6e, 0x65, 0x77, 0x20, 0x67, 0x28, 0x74, 0x29, 0x3b, 0x74, 0x72, 0x79, - 0x7b, 0x6e, 0x2e, 0x63, 0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, - 0x28, 0x74, 0x29, 0x7b, 0x6e, 0x2e, 0x64, 0x28, 0x29, 0x3b, 0x74, 0x68, - 0x72, 0x6f, 0x77, 0x20, 0x74, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x6e, 0x2e, 0x64, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x6e, 0x29, - 0x7d, 0x76, 0x61, 0x72, 0x20, 0x6b, 0x2c, 0x53, 0x2c, 0x78, 0x2c, 0x77, - 0x2c, 0x43, 0x2c, 0x45, 0x2c, 0x55, 0x2c, 0x48, 0x2c, 0x4e, 0x2c, 0x50, - 0x3d, 0x7b, 0x7d, 0x2c, 0x44, 0x3d, 0x5b, 0x5d, 0x2c, 0x24, 0x3d, 0x2f, - 0x61, 0x63, 0x69, 0x74, 0x7c, 0x65, 0x78, 0x28, 0x3f, 0x3a, 0x73, 0x7c, - 0x67, 0x7c, 0x6e, 0x7c, 0x70, 0x7c, 0x24, 0x29, 0x7c, 0x72, 0x70, 0x68, - 0x7c, 0x67, 0x72, 0x69, 0x64, 0x7c, 0x6f, 0x77, 0x73, 0x7c, 0x6d, 0x6e, - 0x63, 0x7c, 0x6e, 0x74, 0x77, 0x7c, 0x69, 0x6e, 0x65, 0x5b, 0x63, 0x68, - 0x5d, 0x7c, 0x7a, 0x6f, 0x6f, 0x7c, 0x5e, 0x6f, 0x72, 0x64, 0x7c, 0x69, - 0x74, 0x65, 0x72, 0x61, 0x2f, 0x69, 0x2c, 0x54, 0x3d, 0x41, 0x72, 0x72, - 0x61, 0x79, 0x2e, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x3b, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x56, 0x28, 0x74, 0x2c, - 0x6e, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x65, - 0x20, 0x69, 0x6e, 0x20, 
0x6e, 0x29, 0x74, 0x5b, 0x65, 0x5d, 0x3d, 0x6e, - 0x5b, 0x65, 0x5d, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, - 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x41, 0x28, - 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x70, - 0x61, 0x72, 0x65, 0x6e, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x3b, 0x6e, 0x26, - 0x26, 0x6e, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x43, 0x68, 0x69, - 0x6c, 0x64, 0x28, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x46, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, - 0x76, 0x61, 0x72, 0x20, 0x69, 0x2c, 0x5f, 0x2c, 0x6f, 0x2c, 0x72, 0x3d, - 0x7b, 0x7d, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6f, 0x20, 0x69, 0x6e, 0x20, - 0x6e, 0x29, 0x22, 0x6b, 0x65, 0x79, 0x22, 0x3d, 0x3d, 0x6f, 0x3f, 0x69, - 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3a, 0x22, 0x72, 0x65, 0x66, 0x22, 0x3d, - 0x3d, 0x6f, 0x3f, 0x5f, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3a, 0x72, 0x5b, - 0x6f, 0x5d, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3b, 0x69, 0x66, 0x28, 0x61, - 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x6c, 0x65, 0x6e, - 0x67, 0x74, 0x68, 0x3e, 0x32, 0x26, 0x26, 0x28, 0x72, 0x2e, 0x63, 0x68, - 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x3d, 0x61, 0x72, 0x67, 0x75, 0x6d, - 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, - 0x33, 0x3f, 0x6b, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x61, 0x72, 0x67, - 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2c, 0x32, 0x29, 0x3a, 0x65, 0x29, - 0x2c, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, - 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x26, 0x26, 0x6e, - 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, - 0x6c, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x66, 0x6f, 0x72, 0x28, - 0x6f, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, - 0x6c, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x72, 0x5b, 0x6f, 0x5d, 0x26, 0x26, 0x28, - 0x72, 0x5b, 0x6f, 0x5d, 0x3d, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, - 0x6c, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x5b, 0x6f, 0x5d, 0x29, 0x3b, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x4d, 0x28, 0x74, 0x2c, 0x72, - 0x2c, 0x69, 0x2c, 0x5f, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4d, 0x28, 0x74, 0x2c, - 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x2c, 0x5f, 0x29, 0x7b, 0x76, 0x61, 0x72, - 0x20, 0x6f, 0x3d, 0x7b, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x74, 0x2c, 0x70, - 0x72, 0x6f, 0x70, 0x73, 0x3a, 0x6e, 0x2c, 0x6b, 0x65, 0x79, 0x3a, 0x65, - 0x2c, 0x72, 0x65, 0x66, 0x3a, 0x69, 0x2c, 0x5f, 0x5f, 0x6b, 0x3a, 0x6e, - 0x75, 0x6c, 0x6c, 0x2c, 0x5f, 0x5f, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, - 0x5f, 0x5f, 0x62, 0x3a, 0x30, 0x2c, 0x5f, 0x5f, 0x65, 0x3a, 0x6e, 0x75, - 0x6c, 0x6c, 0x2c, 0x5f, 0x5f, 0x64, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, - 0x30, 0x2c, 0x5f, 0x5f, 0x63, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x5f, - 0x5f, 0x68, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x3a, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x2c, 0x5f, 0x5f, 0x76, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, - 0x3d, 0x5f, 0x3f, 0x2b, 0x2b, 0x78, 0x3a, 0x5f, 0x7d, 0x3b, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x5f, - 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x53, 0x2e, 0x76, 0x6e, - 0x6f, 0x64, 0x65, 0x26, 0x26, 0x53, 0x2e, 0x76, 0x6e, 0x6f, 0x64, 0x65, - 0x28, 0x6f, 0x29, 0x2c, 0x6f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x57, 
[Hunk truncated: this portion of the diff is an auto-generated C header that embeds the server's web UI as a byte array (xxd -i style output, most likely examples/server/index.js.hpp). The extraction flattened thousands of one-byte `0xNN,` diff lines into unreadable runs. Decoding the removed and added bytes shows an update to the bundled, minified Preact runtime (renamed internal helpers and a reworked keyed-children diff); generated hunks like this are reviewed by regenerating the header from the source index.js rather than by reading the hex.]
0x73, - 0x2e, 0x5f, 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x3d, 0x3d, 0x3d, 0x74, 0x2e, - 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x29, 0x7c, 0x7c, - 0x28, 0x74, 0x2e, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, - 0x3d, 0x73, 0x26, 0x26, 0x73, 0x2e, 0x5f, 0x5f, 0x68, 0x74, 0x6d, 0x6c, - 0x7c, 0x7c, 0x22, 0x22, 0x29, 0x29, 0x7d, 0x69, 0x66, 0x28, 0x51, 0x28, - 0x74, 0x2c, 0x68, 0x2c, 0x63, 0x2c, 0x5f, 0x2c, 0x75, 0x29, 0x2c, 0x73, - 0x29, 0x6e, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x5b, 0x5d, 0x3b, 0x65, 0x6c, - 0x73, 0x65, 0x20, 0x69, 0x66, 0x28, 0x42, 0x28, 0x74, 0x2c, 0x54, 0x28, - 0x70, 0x3d, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, 0x68, - 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x29, 0x3f, 0x70, 0x3a, 0x5b, 0x70, - 0x5d, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x2c, 0x5f, 0x26, 0x26, 0x22, - 0x66, 0x6f, 0x72, 0x65, 0x69, 0x67, 0x6e, 0x4f, 0x62, 0x6a, 0x65, 0x63, - 0x74, 0x22, 0x21, 0x3d, 0x3d, 0x61, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x6f, - 0x3f, 0x6f, 0x5b, 0x30, 0x5d, 0x3a, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, 0x26, - 0x26, 0x52, 0x28, 0x65, 0x2c, 0x30, 0x29, 0x2c, 0x75, 0x29, 0x2c, 0x6e, - 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x70, - 0x3d, 0x6f, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x70, 0x2d, - 0x2d, 0x3b, 0x29, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x5b, 0x70, - 0x5d, 0x26, 0x26, 0x41, 0x28, 0x6f, 0x5b, 0x70, 0x5d, 0x29, 0x3b, 0x75, - 0x7c, 0x7c, 0x28, 0x22, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x69, 0x6e, - 0x20, 0x68, 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, - 0x3d, 0x28, 0x70, 0x3d, 0x68, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, - 0x26, 0x26, 0x28, 0x70, 0x21, 0x3d, 0x3d, 0x74, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x7c, 0x7c, 0x22, 0x70, 0x72, 0x6f, 0x67, 0x72, 0x65, 0x73, - 0x73, 0x22, 0x3d, 0x3d, 0x3d, 0x61, 0x26, 0x26, 0x21, 0x70, 0x7c, 0x7c, - 0x22, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x3d, 0x61, - 0x26, 0x26, 0x70, 0x21, 0x3d, 0x3d, 0x63, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x29, 0x26, 0x26, 0x59, 0x28, 0x74, 0x2c, 0x22, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x22, 0x2c, 0x70, 0x2c, 0x63, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2c, 0x21, 0x31, 0x29, 0x2c, 0x22, 0x63, 0x68, 0x65, 0x63, 0x6b, - 0x65, 0x64, 0x22, 0x69, 0x6e, 0x20, 0x68, 0x26, 0x26, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x28, 0x70, 0x3d, 0x68, 0x2e, 0x63, - 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x29, 0x26, 0x26, 0x70, 0x21, 0x3d, - 0x3d, 0x74, 0x2e, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x26, 0x26, - 0x59, 0x28, 0x74, 0x2c, 0x22, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, - 0x22, 0x2c, 0x70, 0x2c, 0x63, 0x2e, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, - 0x64, 0x2c, 0x21, 0x31, 0x29, 0x29, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x74, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x5f, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x74, - 0x72, 0x79, 0x7b, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x3f, - 0x74, 0x28, 0x6e, 0x29, 0x3a, 0x74, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, - 0x6e, 0x74, 0x3d, 0x6e, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, - 0x29, 0x7b, 0x53, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x74, 0x2c, 0x65, 0x29, - 0x7d, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6f, - 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, - 0x20, 0x69, 0x2c, 0x5f, 0x3b, 0x69, 0x66, 0x28, 0x53, 0x2e, 0x75, 0x6e, - 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x26, 0x26, 0x53, 0x2e, 0x75, 0x6e, 
0x6d, - 0x6f, 0x75, 0x6e, 0x74, 0x28, 0x74, 0x29, 0x2c, 0x28, 0x69, 0x3d, 0x74, - 0x2e, 0x72, 0x65, 0x66, 0x29, 0x26, 0x26, 0x28, 0x69, 0x2e, 0x63, 0x75, - 0x72, 0x72, 0x65, 0x6e, 0x74, 0x26, 0x26, 0x69, 0x2e, 0x63, 0x75, 0x72, - 0x72, 0x65, 0x6e, 0x74, 0x21, 0x3d, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x65, - 0x7c, 0x7c, 0x5f, 0x74, 0x28, 0x69, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, - 0x6e, 0x29, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28, 0x69, - 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x29, 0x29, 0x7b, 0x69, 0x66, 0x28, - 0x69, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, - 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x29, 0x74, - 0x72, 0x79, 0x7b, 0x69, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, - 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, - 0x74, 0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, - 0x7b, 0x53, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7d, - 0x69, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x3d, 0x69, 0x2e, 0x5f, 0x5f, 0x50, - 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, - 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x69, 0x66, 0x28, 0x69, 0x3d, - 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x5f, 0x3d, - 0x30, 0x3b, 0x5f, 0x3c, 0x69, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, - 0x3b, 0x5f, 0x2b, 0x2b, 0x29, 0x69, 0x5b, 0x5f, 0x5d, 0x26, 0x26, 0x6f, - 0x74, 0x28, 0x69, 0x5b, 0x5f, 0x5d, 0x2c, 0x6e, 0x2c, 0x65, 0x7c, 0x7c, - 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x21, 0x3d, - 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x2e, 0x74, 0x79, 0x70, - 0x65, 0x29, 0x3b, 0x65, 0x7c, 0x7c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, - 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x7c, 0x7c, 0x41, 0x28, 0x74, 0x2e, 0x5f, - 0x5f, 0x65, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2e, 0x5f, - 0x5f, 0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x72, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x28, 0x74, - 0x2c, 0x65, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x75, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, - 0x61, 0x72, 0x20, 0x69, 0x2c, 0x5f, 0x2c, 0x6f, 0x3b, 0x53, 0x2e, 0x5f, - 0x5f, 0x26, 0x26, 0x53, 0x2e, 0x5f, 0x5f, 0x28, 0x74, 0x2c, 0x6e, 0x29, - 0x2c, 0x5f, 0x3d, 0x28, 0x69, 0x3d, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, - 0x20, 0x65, 0x29, 0x3f, 0x6e, 0x75, 0x6c, 0x6c, 0x3a, 0x65, 0x26, 0x26, - 0x65, 0x2e, 0x5f, 0x5f, 0x6b, 0x7c, 0x7c, 0x6e, 0x2e, 0x5f, 0x5f, 0x6b, - 0x2c, 0x6f, 0x3d, 0x5b, 0x5d, 0x2c, 0x6e, 0x74, 0x28, 0x6e, 0x2c, 0x74, + 0x65, 0x28, 0x61, 0x2c, 0x70, 0x29, 0x29, 0x2c, 0x7a, 0x28, 0x74, 0x2c, + 0x41, 0x28, 0x43, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x73, 0x26, + 0x26, 0x73, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x3d, 0x3d, 0x52, 0x26, + 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x73, 0x2e, 0x6b, 0x65, 0x79, + 0x3f, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, 0x68, 0x69, + 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x3a, 0x73, 0x29, 0x3f, 0x43, 0x3a, 0x5b, + 0x43, 0x5d, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x2c, 0x5f, 0x2c, 0x6f, + 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66, 0x2c, 0x6c, 0x29, 0x2c, 0x63, 0x2e, + 0x62, 0x61, 0x73, 0x65, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x2c, 
0x6e, + 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x63, 0x2e, + 0x5f, 0x5f, 0x68, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x26, 0x26, + 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x63, 0x29, 0x2c, 0x76, 0x26, + 0x26, 0x28, 0x63, 0x2e, 0x5f, 0x5f, 0x45, 0x3d, 0x63, 0x2e, 0x5f, 0x5f, + 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, + 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x6f, 0x26, 0x26, 0x6e, 0x2e, 0x5f, + 0x5f, 0x76, 0x3d, 0x3d, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x76, 0x3f, 0x28, + 0x6e, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, 0x2c, + 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x65, 0x29, + 0x3a, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x6f, 0x74, 0x28, 0x65, 0x2e, + 0x5f, 0x5f, 0x65, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x2c, 0x5f, 0x2c, + 0x6f, 0x2c, 0x72, 0x2c, 0x66, 0x2c, 0x6c, 0x29, 0x3b, 0x28, 0x73, 0x3d, + 0x77, 0x2e, 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, 0x29, 0x26, 0x26, 0x73, + 0x28, 0x6e, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, + 0x7b, 0x6e, 0x2e, 0x5f, 0x5f, 0x76, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, + 0x28, 0x66, 0x7c, 0x7c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x29, + 0x26, 0x26, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x75, 0x2c, 0x6e, + 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x21, 0x21, 0x66, 0x2c, 0x6f, 0x5b, 0x6f, + 0x2e, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x4f, 0x66, 0x28, 0x75, 0x29, 0x5d, + 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x2c, 0x77, 0x2e, 0x5f, 0x5f, 0x65, + 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7d, 0x7d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x5f, 0x74, 0x28, 0x74, 0x2c, 0x6e, + 0x2c, 0x65, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, + 0x69, 0x3d, 0x30, 0x3b, 0x69, 0x3c, 0x65, 0x2e, 0x6c, 0x65, 0x6e, 0x67, + 0x74, 0x68, 0x3b, 0x69, 0x2b, 0x2b, 0x29, 0x72, 0x74, 0x28, 0x65, 0x5b, + 0x69, 0x5d, 0x2c, 0x65, 0x5b, 0x2b, 0x2b, 0x69, 0x5d, 0x2c, 0x65, 0x5b, + 0x2b, 0x2b, 0x69, 0x5d, 0x29, 0x3b, 0x77, 0x2e, 0x5f, 0x5f, 0x63, 0x26, + 0x26, 0x77, 0x2e, 0x5f, 0x5f, 0x63, 0x28, 0x6e, 0x2c, 0x74, 0x29, 0x2c, + 0x74, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x72, 0x79, 0x7b, + 0x74, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, + 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x74, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, + 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, + 0x7b, 0x74, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x6e, 0x29, 0x7d, 0x29, + 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x77, + 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x74, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x76, + 0x29, 0x7d, 0x7d, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x20, 0x6f, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, + 0x69, 0x2c, 0x5f, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66, 0x29, + 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6c, 0x2c, 0x73, 0x2c, 0x63, 0x2c, 0x68, + 0x3d, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c, 0x61, 0x3d, 0x6e, + 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c, 0x70, 0x3d, 0x6e, 0x2e, 0x74, + 0x79, 0x70, 0x65, 0x2c, 0x64, 0x3d, 0x30, 0x3b, 0x69, 0x66, 0x28, 0x22, + 0x73, 0x76, 0x67, 0x22, 0x3d, 0x3d, 0x3d, 0x70, 0x26, 0x26, 0x28, 0x5f, + 0x3d, 0x21, 0x30, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, + 0x29, 0x66, 0x6f, 0x72, 0x28, 0x3b, 0x64, 0x3c, 0x6f, 0x2e, 0x6c, 0x65, + 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x64, 0x2b, 0x2b, 0x29, 0x69, 0x66, 0x28, + 0x28, 0x6c, 0x3d, 0x6f, 0x5b, 0x64, 0x5d, 0x29, 0x26, 0x26, 0x22, 
0x73, + 0x65, 0x74, 0x41, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x22, + 0x69, 0x6e, 0x20, 0x6c, 0x3d, 0x3d, 0x21, 0x21, 0x70, 0x26, 0x26, 0x28, + 0x70, 0x3f, 0x6c, 0x2e, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x4e, 0x61, 0x6d, + 0x65, 0x3d, 0x3d, 0x3d, 0x70, 0x3a, 0x33, 0x3d, 0x3d, 0x3d, 0x6c, 0x2e, + 0x6e, 0x6f, 0x64, 0x65, 0x54, 0x79, 0x70, 0x65, 0x29, 0x29, 0x7b, 0x74, + 0x3d, 0x6c, 0x2c, 0x6f, 0x5b, 0x64, 0x5d, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, + 0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x7d, 0x69, 0x66, 0x28, 0x6e, 0x75, + 0x6c, 0x6c, 0x3d, 0x3d, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x6e, 0x75, + 0x6c, 0x6c, 0x3d, 0x3d, 0x3d, 0x70, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x63, + 0x72, 0x65, 0x61, 0x74, 0x65, 0x54, 0x65, 0x78, 0x74, 0x4e, 0x6f, 0x64, + 0x65, 0x28, 0x61, 0x29, 0x3b, 0x74, 0x3d, 0x5f, 0x3f, 0x64, 0x6f, 0x63, + 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, + 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x4e, 0x53, 0x28, 0x22, 0x68, + 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77, 0x77, 0x77, 0x2e, 0x77, 0x33, + 0x2e, 0x6f, 0x72, 0x67, 0x2f, 0x32, 0x30, 0x30, 0x30, 0x2f, 0x73, 0x76, + 0x67, 0x22, 0x2c, 0x70, 0x29, 0x3a, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, + 0x6e, 0x74, 0x2e, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x45, 0x6c, 0x65, + 0x6d, 0x65, 0x6e, 0x74, 0x28, 0x70, 0x2c, 0x61, 0x2e, 0x69, 0x73, 0x26, + 0x26, 0x61, 0x29, 0x2c, 0x6f, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x75, + 0x3d, 0x21, 0x31, 0x7d, 0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, + 0x3d, 0x3d, 0x70, 0x29, 0x68, 0x3d, 0x3d, 0x3d, 0x61, 0x7c, 0x7c, 0x75, + 0x26, 0x26, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3d, 0x3d, 0x3d, 0x61, + 0x7c, 0x7c, 0x28, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3d, 0x61, 0x29, + 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x7b, 0x69, 0x66, 0x28, 0x6f, 0x3d, 0x6f, + 0x26, 0x26, 0x78, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x2e, 0x63, + 0x68, 0x69, 0x6c, 0x64, 0x4e, 0x6f, 0x64, 0x65, 0x73, 0x29, 0x2c, 0x73, + 0x3d, 0x28, 0x68, 0x3d, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x7c, + 0x7c, 0x44, 0x29, 0x2e, 0x64, 0x61, 0x6e, 0x67, 0x65, 0x72, 0x6f, 0x75, + 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, 0x65, 0x72, 0x48, + 0x54, 0x4d, 0x4c, 0x2c, 0x63, 0x3d, 0x61, 0x2e, 0x64, 0x61, 0x6e, 0x67, + 0x65, 0x72, 0x6f, 0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, + 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x2c, 0x21, 0x75, 0x29, 0x7b, + 0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x29, 0x66, + 0x6f, 0x72, 0x28, 0x68, 0x3d, 0x7b, 0x7d, 0x2c, 0x64, 0x3d, 0x30, 0x3b, + 0x64, 0x3c, 0x74, 0x2e, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, + 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x64, 0x2b, + 0x2b, 0x29, 0x68, 0x5b, 0x74, 0x2e, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, + 0x75, 0x74, 0x65, 0x73, 0x5b, 0x64, 0x5d, 0x2e, 0x6e, 0x61, 0x6d, 0x65, + 0x5d, 0x3d, 0x74, 0x2e, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, + 0x65, 0x73, 0x5b, 0x64, 0x5d, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, + 0x28, 0x63, 0x7c, 0x7c, 0x73, 0x29, 0x26, 0x26, 0x28, 0x63, 0x26, 0x26, + 0x28, 0x73, 0x26, 0x26, 0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x74, 0x6d, 0x6c, + 0x3d, 0x3d, 0x73, 0x2e, 0x5f, 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x7c, 0x7c, + 0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x3d, 0x3d, 0x3d, 0x74, + 0x2e, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x29, 0x7c, + 0x7c, 0x28, 0x74, 0x2e, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, + 0x4c, 0x3d, 0x63, 0x26, 0x26, 0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x74, 
0x6d, + 0x6c, 0x7c, 0x7c, 0x22, 0x22, 0x29, 0x29, 0x7d, 0x69, 0x66, 0x28, 0x59, + 0x28, 0x74, 0x2c, 0x61, 0x2c, 0x68, 0x2c, 0x5f, 0x2c, 0x75, 0x29, 0x2c, + 0x63, 0x29, 0x6e, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x5b, 0x5d, 0x3b, 0x65, + 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x28, 0x7a, 0x28, 0x74, 0x2c, 0x41, + 0x28, 0x64, 0x3d, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, + 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x29, 0x3f, 0x64, 0x3a, 0x5b, + 0x64, 0x5d, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x2c, 0x5f, 0x26, 0x26, + 0x22, 0x66, 0x6f, 0x72, 0x65, 0x69, 0x67, 0x6e, 0x4f, 0x62, 0x6a, 0x65, + 0x63, 0x74, 0x22, 0x21, 0x3d, 0x3d, 0x70, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, + 0x6f, 0x3f, 0x6f, 0x5b, 0x30, 0x5d, 0x3a, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, + 0x26, 0x26, 0x6a, 0x28, 0x65, 0x2c, 0x30, 0x29, 0x2c, 0x75, 0x2c, 0x66, + 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x29, 0x66, 0x6f, + 0x72, 0x28, 0x64, 0x3d, 0x6f, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, + 0x3b, 0x64, 0x2d, 0x2d, 0x3b, 0x29, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, + 0x6f, 0x5b, 0x64, 0x5d, 0x26, 0x26, 0x4d, 0x28, 0x6f, 0x5b, 0x64, 0x5d, + 0x29, 0x3b, 0x75, 0x7c, 0x7c, 0x28, 0x22, 0x76, 0x61, 0x6c, 0x75, 0x65, + 0x22, 0x69, 0x6e, 0x20, 0x61, 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x21, 0x3d, 0x3d, 0x28, 0x64, 0x3d, 0x61, 0x2e, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x29, 0x26, 0x26, 0x28, 0x64, 0x21, 0x3d, 0x3d, 0x74, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7c, 0x7c, 0x22, 0x70, 0x72, 0x6f, 0x67, + 0x72, 0x65, 0x73, 0x73, 0x22, 0x3d, 0x3d, 0x3d, 0x70, 0x26, 0x26, 0x21, + 0x64, 0x7c, 0x7c, 0x22, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, + 0x3d, 0x3d, 0x70, 0x26, 0x26, 0x64, 0x21, 0x3d, 0x3d, 0x68, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x29, 0x26, 0x26, 0x74, 0x74, 0x28, 0x74, 0x2c, + 0x22, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x2c, 0x64, 0x2c, 0x68, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x21, 0x31, 0x29, 0x2c, 0x22, 0x63, + 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x22, 0x69, 0x6e, 0x20, 0x61, 0x26, + 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x28, 0x64, + 0x3d, 0x61, 0x2e, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x29, 0x26, + 0x26, 0x64, 0x21, 0x3d, 0x3d, 0x74, 0x2e, 0x63, 0x68, 0x65, 0x63, 0x6b, + 0x65, 0x64, 0x26, 0x26, 0x74, 0x74, 0x28, 0x74, 0x2c, 0x22, 0x63, 0x68, + 0x65, 0x63, 0x6b, 0x65, 0x64, 0x22, 0x2c, 0x64, 0x2c, 0x68, 0x2e, 0x63, + 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x2c, 0x21, 0x31, 0x29, 0x29, 0x7d, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x7d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x72, 0x74, 0x28, 0x74, 0x2c, 0x6e, + 0x2c, 0x65, 0x29, 0x7b, 0x74, 0x72, 0x79, 0x7b, 0x22, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, + 0x6f, 0x66, 0x20, 0x74, 0x3f, 0x74, 0x28, 0x6e, 0x29, 0x3a, 0x74, 0x2e, + 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x6e, 0x7d, 0x63, 0x61, + 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x77, 0x2e, 0x5f, 0x5f, 0x65, + 0x28, 0x74, 0x2c, 0x65, 0x29, 0x7d, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, + 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x69, 0x2c, 0x5f, 0x3b, 0x69, 0x66, + 0x28, 0x77, 0x2e, 0x75, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x26, 0x26, + 0x77, 0x2e, 0x75, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x28, 0x74, 0x29, + 0x2c, 0x28, 0x69, 0x3d, 0x74, 0x2e, 0x72, 0x65, 0x66, 0x29, 0x26, 0x26, + 0x28, 0x69, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x26, 0x26, + 0x69, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x21, 0x3d, 
0x3d, + 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x7c, 0x7c, 0x72, 0x74, 0x28, 0x69, 0x2c, + 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6e, 0x29, 0x29, 0x2c, 0x6e, 0x75, 0x6c, + 0x6c, 0x21, 0x3d, 0x28, 0x69, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x29, + 0x29, 0x7b, 0x69, 0x66, 0x28, 0x69, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, + 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, + 0x75, 0x6e, 0x74, 0x29, 0x74, 0x72, 0x79, 0x7b, 0x69, 0x2e, 0x63, 0x6f, + 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, + 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, + 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x77, 0x2e, 0x5f, 0x5f, 0x65, 0x28, + 0x74, 0x2c, 0x6e, 0x29, 0x7d, 0x69, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x3d, + 0x69, 0x2e, 0x5f, 0x5f, 0x50, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x74, + 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, + 0x69, 0x66, 0x28, 0x69, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x29, 0x66, + 0x6f, 0x72, 0x28, 0x5f, 0x3d, 0x30, 0x3b, 0x5f, 0x3c, 0x69, 0x2e, 0x6c, + 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x5f, 0x2b, 0x2b, 0x29, 0x69, 0x5b, + 0x5f, 0x5d, 0x26, 0x26, 0x75, 0x74, 0x28, 0x69, 0x5b, 0x5f, 0x5d, 0x2c, + 0x6e, 0x2c, 0x65, 0x7c, 0x7c, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x22, 0x21, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, + 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x65, 0x7c, 0x7c, 0x6e, + 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x7c, 0x7c, + 0x4d, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x29, 0x2c, 0x74, 0x2e, 0x5f, + 0x5f, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, + 0x64, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x66, 0x74, 0x28, 0x74, 0x2c, 0x6e, + 0x2c, 0x65, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, + 0x74, 0x6f, 0x72, 0x28, 0x74, 0x2c, 0x65, 0x29, 0x7d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6c, 0x74, 0x28, 0x74, 0x2c, 0x6e, + 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x69, 0x2c, 0x5f, 0x2c, + 0x6f, 0x2c, 0x72, 0x3b, 0x77, 0x2e, 0x5f, 0x5f, 0x26, 0x26, 0x77, 0x2e, + 0x5f, 0x5f, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x2c, 0x5f, 0x3d, 0x28, 0x69, + 0x3d, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, + 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x65, 0x29, 0x3f, 0x6e, + 0x75, 0x6c, 0x6c, 0x3a, 0x65, 0x26, 0x26, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, + 0x7c, 0x7c, 0x6e, 0x2e, 0x5f, 0x5f, 0x6b, 0x2c, 0x6f, 0x3d, 0x5b, 0x5d, + 0x2c, 0x72, 0x3d, 0x5b, 0x5d, 0x2c, 0x69, 0x74, 0x28, 0x6e, 0x2c, 0x74, 0x3d, 0x28, 0x21, 0x69, 0x26, 0x26, 0x65, 0x7c, 0x7c, 0x6e, 0x29, 0x2e, - 0x5f, 0x5f, 0x6b, 0x3d, 0x46, 0x28, 0x4f, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, - 0x2c, 0x5b, 0x74, 0x5d, 0x29, 0x2c, 0x5f, 0x7c, 0x7c, 0x50, 0x2c, 0x50, + 0x5f, 0x5f, 0x6b, 0x3d, 0x57, 0x28, 0x52, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, + 0x2c, 0x5b, 0x74, 0x5d, 0x29, 0x2c, 0x5f, 0x7c, 0x7c, 0x44, 0x2c, 0x44, 0x2c, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x6f, 0x77, 0x6e, 0x65, 0x72, 0x53, 0x56, 0x47, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x21, 0x69, 0x26, 0x26, 0x65, 0x3f, 0x5b, 0x65, 0x5d, 0x3a, 0x5f, 0x3f, 0x6e, 0x75, 0x6c, 0x6c, 0x3a, 0x6e, 0x2e, 0x66, - 0x69, 0x72, 0x73, 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x3f, 0x6b, 0x2e, + 0x69, 0x72, 0x73, 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x3f, 0x78, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x6e, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x4e, 
0x6f, 0x64, 0x65, 0x73, 0x29, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6f, 0x2c, 0x21, 0x69, 0x26, 0x26, 0x65, 0x3f, 0x65, 0x3a, 0x5f, 0x3f, 0x5f, 0x2e, 0x5f, 0x5f, 0x65, 0x3a, 0x6e, 0x2e, 0x66, 0x69, 0x72, 0x73, - 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x2c, 0x69, 0x29, 0x2c, 0x65, 0x74, - 0x28, 0x6f, 0x2c, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x6c, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x75, - 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x6c, 0x74, 0x29, 0x7d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x66, 0x74, 0x28, 0x74, 0x2c, - 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x69, 0x2c, 0x5f, - 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x3d, 0x56, 0x28, 0x7b, 0x7d, 0x2c, - 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x3b, 0x66, 0x6f, 0x72, - 0x28, 0x6f, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, - 0x26, 0x26, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x64, 0x65, 0x66, - 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x26, 0x26, 0x28, - 0x72, 0x3d, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x64, 0x65, 0x66, - 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x2c, 0x6e, - 0x29, 0x22, 0x6b, 0x65, 0x79, 0x22, 0x3d, 0x3d, 0x6f, 0x3f, 0x69, 0x3d, - 0x6e, 0x5b, 0x6f, 0x5d, 0x3a, 0x22, 0x72, 0x65, 0x66, 0x22, 0x3d, 0x3d, - 0x6f, 0x3f, 0x5f, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3a, 0x75, 0x5b, 0x6f, - 0x5d, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x6e, - 0x5b, 0x6f, 0x5d, 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, - 0x3d, 0x3d, 0x72, 0x3f, 0x72, 0x5b, 0x6f, 0x5d, 0x3a, 0x6e, 0x5b, 0x6f, - 0x5d, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x72, 0x67, - 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, - 0x68, 0x3e, 0x32, 0x26, 0x26, 0x28, 0x75, 0x2e, 0x63, 0x68, 0x69, 0x6c, - 0x64, 0x72, 0x65, 0x6e, 0x3d, 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, - 0x74, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x33, 0x3f, - 0x6b, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x61, 0x72, 0x67, 0x75, 0x6d, - 0x65, 0x6e, 0x74, 0x73, 0x2c, 0x32, 0x29, 0x3a, 0x65, 0x29, 0x2c, 0x4d, - 0x28, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x75, 0x2c, 0x69, 0x7c, - 0x7c, 0x74, 0x2e, 0x6b, 0x65, 0x79, 0x2c, 0x5f, 0x7c, 0x7c, 0x74, 0x2e, - 0x72, 0x65, 0x66, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x66, 0x75, + 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x2c, 0x69, 0x2c, 0x72, 0x29, 0x2c, + 0x5f, 0x74, 0x28, 0x6f, 0x2c, 0x74, 0x2c, 0x72, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x73, 0x74, 0x28, 0x74, 0x2c, - 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x7b, 0x5f, 0x5f, - 0x63, 0x3a, 0x6e, 0x3d, 0x22, 0x5f, 0x5f, 0x63, 0x43, 0x22, 0x2b, 0x4e, - 0x2b, 0x2b, 0x2c, 0x5f, 0x5f, 0x3a, 0x74, 0x2c, 0x43, 0x6f, 0x6e, 0x73, - 0x75, 0x6d, 0x65, 0x72, 0x3a, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x74, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, - 0x28, 0x6e, 0x29, 0x7d, 0x2c, 0x50, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, - 0x72, 0x3a, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, - 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x2c, 0x69, 0x3b, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, 0x65, - 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, - 0x74, 0x7c, 0x7c, 0x28, 0x65, 0x3d, 0x5b, 0x5d, 0x2c, 0x28, 0x69, 0x3d, - 0x7b, 0x7d, 0x29, 0x5b, 0x6e, 0x5d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2c, - 0x74, 0x68, 
0x69, 0x73, 0x2e, 0x67, 0x65, 0x74, 0x43, 0x68, 0x69, 0x6c, - 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x69, 0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, - 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, - 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x21, 0x3d, 0x3d, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x26, - 0x26, 0x65, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x5f, - 0x5f, 0x65, 0x3d, 0x21, 0x30, 0x2c, 0x6a, 0x28, 0x74, 0x29, 0x7d, 0x29, - 0x29, 0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x75, 0x62, 0x3d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, - 0x65, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x74, 0x29, 0x3b, 0x76, 0x61, - 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, - 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, - 0x6e, 0x74, 0x3b, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, - 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, - 0x74, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, - 0x7b, 0x65, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x63, 0x65, 0x28, 0x65, 0x2e, - 0x69, 0x6e, 0x64, 0x65, 0x78, 0x4f, 0x66, 0x28, 0x74, 0x29, 0x2c, 0x31, - 0x29, 0x2c, 0x6e, 0x26, 0x26, 0x6e, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, - 0x74, 0x29, 0x7d, 0x7d, 0x29, 0x2c, 0x74, 0x2e, 0x63, 0x68, 0x69, 0x6c, - 0x64, 0x72, 0x65, 0x6e, 0x7d, 0x7d, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x65, 0x2e, 0x50, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, 0x72, - 0x2e, 0x5f, 0x5f, 0x3d, 0x65, 0x2e, 0x43, 0x6f, 0x6e, 0x73, 0x75, 0x6d, - 0x65, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x54, 0x79, - 0x70, 0x65, 0x3d, 0x65, 0x7d, 0x6b, 0x3d, 0x44, 0x2e, 0x73, 0x6c, 0x69, - 0x63, 0x65, 0x2c, 0x53, 0x3d, 0x7b, 0x5f, 0x5f, 0x65, 0x3a, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, - 0x2c, 0x69, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, - 0x5f, 0x2c, 0x6f, 0x2c, 0x72, 0x3b, 0x6e, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, - 0x3b, 0x29, 0x69, 0x66, 0x28, 0x28, 0x5f, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, - 0x63, 0x29, 0x26, 0x26, 0x21, 0x5f, 0x2e, 0x5f, 0x5f, 0x29, 0x74, 0x72, - 0x79, 0x7b, 0x69, 0x66, 0x28, 0x28, 0x6f, 0x3d, 0x5f, 0x2e, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x29, 0x26, 0x26, - 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x2e, 0x67, 0x65, 0x74, 0x44, - 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x46, - 0x72, 0x6f, 0x6d, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x26, 0x26, 0x28, 0x5f, - 0x2e, 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x28, 0x6f, 0x2e, + 0x6e, 0x29, 0x7b, 0x6c, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x73, 0x74, + 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x63, + 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, + 0x20, 0x69, 0x2c, 0x5f, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x3d, 0x46, + 0x28, 0x7b, 0x7d, 0x2c, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, + 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6f, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x2e, + 0x74, 0x79, 0x70, 0x65, 0x26, 0x26, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, + 0x2e, 0x64, 
0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72, 0x6f, 0x70, + 0x73, 0x26, 0x26, 0x28, 0x72, 0x3d, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, + 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72, 0x6f, 0x70, + 0x73, 0x29, 0x2c, 0x6e, 0x29, 0x22, 0x6b, 0x65, 0x79, 0x22, 0x3d, 0x3d, + 0x6f, 0x3f, 0x69, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3a, 0x22, 0x72, 0x65, + 0x66, 0x22, 0x3d, 0x3d, 0x6f, 0x3f, 0x5f, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, + 0x3a, 0x75, 0x5b, 0x6f, 0x5d, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, + 0x3d, 0x3d, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x26, 0x26, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x72, 0x3f, 0x72, 0x5b, 0x6f, 0x5d, + 0x3a, 0x6e, 0x5b, 0x6f, 0x5d, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x20, 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x6c, + 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x32, 0x26, 0x26, 0x28, 0x75, 0x2e, + 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x3d, 0x61, 0x72, 0x67, + 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, + 0x68, 0x3e, 0x33, 0x3f, 0x78, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x61, + 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2c, 0x32, 0x29, 0x3a, + 0x65, 0x29, 0x2c, 0x4f, 0x28, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2c, + 0x75, 0x2c, 0x69, 0x7c, 0x7c, 0x74, 0x2e, 0x6b, 0x65, 0x79, 0x2c, 0x5f, + 0x7c, 0x7c, 0x74, 0x2e, 0x72, 0x65, 0x66, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, + 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x68, + 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, + 0x3d, 0x7b, 0x5f, 0x5f, 0x63, 0x3a, 0x6e, 0x3d, 0x22, 0x5f, 0x5f, 0x63, + 0x43, 0x22, 0x2b, 0x24, 0x2b, 0x2b, 0x2c, 0x5f, 0x5f, 0x3a, 0x74, 0x2c, + 0x43, 0x6f, 0x6e, 0x73, 0x75, 0x6d, 0x65, 0x72, 0x3a, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x63, 0x68, 0x69, 0x6c, + 0x64, 0x72, 0x65, 0x6e, 0x28, 0x6e, 0x29, 0x7d, 0x2c, 0x50, 0x72, 0x6f, + 0x76, 0x69, 0x64, 0x65, 0x72, 0x3a, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x2c, + 0x69, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x67, 0x65, 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x43, 0x6f, + 0x6e, 0x74, 0x65, 0x78, 0x74, 0x7c, 0x7c, 0x28, 0x65, 0x3d, 0x5b, 0x5d, + 0x2c, 0x28, 0x69, 0x3d, 0x7b, 0x7d, 0x29, 0x5b, 0x6e, 0x5d, 0x3d, 0x74, + 0x68, 0x69, 0x73, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, 0x65, 0x74, + 0x43, 0x68, 0x69, 0x6c, 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, + 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x69, 0x7d, 0x2c, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x73, 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, + 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, + 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, + 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x21, 0x3d, 0x3d, 0x74, 0x2e, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x26, 0x26, 0x65, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, + 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, + 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x21, 0x30, 0x2c, 0x71, 0x28, + 0x74, 0x29, 0x7d, 0x29, 0x29, 0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x73, 0x75, 0x62, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x28, 0x74, 0x29, 0x7b, 0x65, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x74, + 0x29, 0x3b, 
0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x63, 0x6f, + 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, + 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x3b, 0x74, 0x2e, 0x63, 0x6f, 0x6d, + 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, + 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x65, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x63, + 0x65, 0x28, 0x65, 0x2e, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x4f, 0x66, 0x28, + 0x74, 0x29, 0x2c, 0x31, 0x29, 0x2c, 0x6e, 0x26, 0x26, 0x6e, 0x2e, 0x63, + 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x29, 0x7d, 0x7d, 0x29, 0x2c, 0x74, 0x2e, + 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x7d, 0x7d, 0x3b, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x2e, 0x50, 0x72, 0x6f, 0x76, + 0x69, 0x64, 0x65, 0x72, 0x2e, 0x5f, 0x5f, 0x3d, 0x65, 0x2e, 0x43, 0x6f, + 0x6e, 0x73, 0x75, 0x6d, 0x65, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x78, 0x74, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x65, 0x7d, 0x78, 0x3d, 0x54, + 0x2e, 0x73, 0x6c, 0x69, 0x63, 0x65, 0x2c, 0x77, 0x3d, 0x7b, 0x5f, 0x5f, + 0x65, 0x3a, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, + 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, + 0x76, 0x61, 0x72, 0x20, 0x5f, 0x2c, 0x6f, 0x2c, 0x72, 0x3b, 0x6e, 0x3d, + 0x6e, 0x2e, 0x5f, 0x5f, 0x3b, 0x29, 0x69, 0x66, 0x28, 0x28, 0x5f, 0x3d, + 0x6e, 0x2e, 0x5f, 0x5f, 0x63, 0x29, 0x26, 0x26, 0x21, 0x5f, 0x2e, 0x5f, + 0x5f, 0x29, 0x74, 0x72, 0x79, 0x7b, 0x69, 0x66, 0x28, 0x28, 0x6f, 0x3d, + 0x5f, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, + 0x72, 0x29, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x2e, 0x67, 0x65, 0x74, 0x44, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x45, 0x72, 0x72, 0x6f, 0x72, - 0x28, 0x74, 0x29, 0x29, 0x2c, 0x72, 0x3d, 0x5f, 0x2e, 0x5f, 0x5f, 0x64, - 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x5f, 0x2e, 0x63, 0x6f, - 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, - 0x74, 0x63, 0x68, 0x26, 0x26, 0x28, 0x5f, 0x2e, 0x63, 0x6f, 0x6d, 0x70, - 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, 0x74, 0x63, - 0x68, 0x28, 0x74, 0x2c, 0x69, 0x7c, 0x7c, 0x7b, 0x7d, 0x29, 0x2c, 0x72, - 0x3d, 0x5f, 0x2e, 0x5f, 0x5f, 0x64, 0x29, 0x2c, 0x72, 0x29, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x2e, 0x5f, 0x5f, 0x45, 0x3d, 0x5f, - 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x3d, - 0x6e, 0x7d, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x74, 0x7d, 0x7d, 0x2c, - 0x78, 0x3d, 0x30, 0x2c, 0x77, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x26, 0x26, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x2e, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x7d, 0x2c, 0x4c, 0x2e, - 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x73, 0x65, - 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72, - 0x20, 0x65, 0x3b, 0x65, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x73, 0x26, 0x26, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x5f, 0x5f, 0x73, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, 0x3f, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x5f, 0x5f, 0x73, 0x3a, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x73, - 0x3d, 0x56, 0x28, 
0x7b, 0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, - 0x74, 0x61, 0x74, 0x65, 0x29, 0x2c, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, - 0x20, 0x74, 0x26, 0x26, 0x28, 0x74, 0x3d, 0x74, 0x28, 0x56, 0x28, 0x7b, - 0x7d, 0x2c, 0x65, 0x29, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, - 0x6f, 0x70, 0x73, 0x29, 0x29, 0x2c, 0x74, 0x26, 0x26, 0x56, 0x28, 0x65, - 0x2c, 0x74, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x26, - 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x76, 0x26, 0x26, 0x28, - 0x6e, 0x26, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x73, 0x62, 0x2e, - 0x70, 0x75, 0x73, 0x68, 0x28, 0x6e, 0x29, 0x2c, 0x6a, 0x28, 0x74, 0x68, - 0x69, 0x73, 0x29, 0x29, 0x7d, 0x2c, 0x4c, 0x2e, 0x70, 0x72, 0x6f, 0x74, - 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x55, - 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, - 0x5f, 0x76, 0x26, 0x26, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, - 0x65, 0x3d, 0x21, 0x30, 0x2c, 0x74, 0x26, 0x26, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x74, 0x29, - 0x2c, 0x6a, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x29, 0x7d, 0x2c, 0x4c, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x72, - 0x65, 0x6e, 0x64, 0x65, 0x72, 0x3d, 0x4f, 0x2c, 0x43, 0x3d, 0x5b, 0x5d, - 0x2c, 0x55, 0x3d, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x50, 0x72, - 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x3f, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, - 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, - 0x74, 0x68, 0x65, 0x6e, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x50, 0x72, - 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x2e, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, - 0x65, 0x28, 0x29, 0x29, 0x3a, 0x73, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, - 0x6f, 0x75, 0x74, 0x2c, 0x48, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x76, 0x2e, 0x5f, 0x5f, 0x62, - 0x2d, 0x6e, 0x2e, 0x5f, 0x5f, 0x76, 0x2e, 0x5f, 0x5f, 0x62, 0x7d, 0x2c, - 0x71, 0x2e, 0x5f, 0x5f, 0x72, 0x3d, 0x30, 0x2c, 0x4e, 0x3d, 0x30, 0x3b, - 0x76, 0x61, 0x72, 0x20, 0x63, 0x74, 0x2c, 0x68, 0x74, 0x2c, 0x61, 0x74, - 0x2c, 0x70, 0x74, 0x2c, 0x64, 0x74, 0x3d, 0x30, 0x2c, 0x76, 0x74, 0x3d, - 0x5b, 0x5d, 0x2c, 0x79, 0x74, 0x3d, 0x5b, 0x5d, 0x2c, 0x6d, 0x74, 0x3d, - 0x53, 0x2e, 0x5f, 0x5f, 0x62, 0x2c, 0x67, 0x74, 0x3d, 0x53, 0x2e, 0x5f, - 0x5f, 0x72, 0x2c, 0x62, 0x74, 0x3d, 0x53, 0x2e, 0x64, 0x69, 0x66, 0x66, - 0x65, 0x64, 0x2c, 0x6b, 0x74, 0x3d, 0x53, 0x2e, 0x5f, 0x5f, 0x63, 0x2c, - 0x53, 0x74, 0x3d, 0x53, 0x2e, 0x75, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, - 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x78, 0x74, - 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x53, 0x2e, 0x5f, 0x5f, 0x68, 0x26, - 0x26, 0x53, 0x2e, 0x5f, 0x5f, 0x68, 0x28, 0x68, 0x74, 0x2c, 0x74, 0x2c, - 0x64, 0x74, 0x7c, 0x7c, 0x6e, 0x29, 0x2c, 0x64, 0x74, 0x3d, 0x30, 0x3b, - 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x68, 0x74, 0x2e, 0x5f, 0x5f, 0x48, - 0x7c, 0x7c, 0x28, 0x68, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x3d, 0x7b, 0x5f, - 0x5f, 0x3a, 0x5b, 0x5d, 0x2c, 0x5f, 0x5f, 0x68, 0x3a, 0x5b, 0x5d, 0x7d, - 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x3e, 0x3d, - 0x65, 0x2e, 0x5f, 0x5f, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x26, - 0x26, 0x65, 0x2e, 
0x5f, 0x5f, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x7b, - 0x5f, 0x5f, 0x56, 0x3a, 0x79, 0x74, 0x7d, 0x29, 0x2c, 0x65, 0x2e, 0x5f, - 0x5f, 0x5b, 0x74, 0x5d, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x77, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x64, 0x74, 0x3d, 0x31, 0x2c, 0x43, 0x74, 0x28, 0x49, - 0x74, 0x2c, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x43, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, - 0x76, 0x61, 0x72, 0x20, 0x69, 0x3d, 0x78, 0x74, 0x28, 0x63, 0x74, 0x2b, - 0x2b, 0x2c, 0x32, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x69, 0x2e, 0x74, 0x3d, - 0x74, 0x2c, 0x21, 0x69, 0x2e, 0x5f, 0x5f, 0x63, 0x26, 0x26, 0x28, 0x69, - 0x2e, 0x5f, 0x5f, 0x3d, 0x5b, 0x65, 0x3f, 0x65, 0x28, 0x6e, 0x29, 0x3a, - 0x49, 0x74, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x6e, 0x29, - 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, - 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x69, 0x2e, 0x5f, 0x5f, 0x4e, - 0x3f, 0x69, 0x2e, 0x5f, 0x5f, 0x4e, 0x5b, 0x30, 0x5d, 0x3a, 0x69, 0x2e, - 0x5f, 0x5f, 0x5b, 0x30, 0x5d, 0x2c, 0x65, 0x3d, 0x69, 0x2e, 0x74, 0x28, - 0x6e, 0x2c, 0x74, 0x29, 0x3b, 0x6e, 0x21, 0x3d, 0x3d, 0x65, 0x26, 0x26, - 0x28, 0x69, 0x2e, 0x5f, 0x5f, 0x4e, 0x3d, 0x5b, 0x65, 0x2c, 0x69, 0x2e, - 0x5f, 0x5f, 0x5b, 0x31, 0x5d, 0x5d, 0x2c, 0x69, 0x2e, 0x5f, 0x5f, 0x63, - 0x2e, 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x28, 0x7b, 0x7d, - 0x29, 0x29, 0x7d, 0x5d, 0x2c, 0x69, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x68, - 0x74, 0x2c, 0x21, 0x68, 0x74, 0x2e, 0x75, 0x29, 0x29, 0x7b, 0x76, 0x61, - 0x72, 0x20, 0x5f, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, - 0x69, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x5f, 0x5f, 0x48, 0x29, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x5f, - 0x3d, 0x69, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, - 0x5f, 0x2e, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x28, 0x28, 0x66, 0x75, + 0x26, 0x26, 0x28, 0x5f, 0x2e, 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, + 0x65, 0x28, 0x6f, 0x2e, 0x67, 0x65, 0x74, 0x44, 0x65, 0x72, 0x69, 0x76, + 0x65, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x45, + 0x72, 0x72, 0x6f, 0x72, 0x28, 0x74, 0x29, 0x29, 0x2c, 0x72, 0x3d, 0x5f, + 0x2e, 0x5f, 0x5f, 0x64, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, + 0x5f, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, + 0x69, 0x64, 0x43, 0x61, 0x74, 0x63, 0x68, 0x26, 0x26, 0x28, 0x5f, 0x2e, + 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, + 0x43, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x2c, 0x69, 0x7c, 0x7c, 0x7b, + 0x7d, 0x29, 0x2c, 0x72, 0x3d, 0x5f, 0x2e, 0x5f, 0x5f, 0x64, 0x29, 0x2c, + 0x72, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x2e, 0x5f, + 0x5f, 0x45, 0x3d, 0x5f, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x6e, + 0x29, 0x7b, 0x74, 0x3d, 0x6e, 0x7d, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, + 0x74, 0x7d, 0x7d, 0x2c, 0x43, 0x3d, 0x30, 0x2c, 0x45, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x7d, 0x29, - 0x29, 0x3b, 0x69, 0x66, 0x28, 0x5f, 0x2e, 0x65, 0x76, 0x65, 0x72, 0x79, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, + 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, + 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, + 0x7d, 0x2c, 0x49, 
0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, + 0x65, 0x2e, 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x3d, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, + 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3b, 0x65, 0x3d, 0x6e, 0x75, 0x6c, + 0x6c, 0x21, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x73, 0x26, + 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x73, 0x21, 0x3d, 0x3d, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, 0x3f, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x73, 0x3a, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x5f, 0x5f, 0x73, 0x3d, 0x46, 0x28, 0x7b, 0x7d, 0x2c, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, 0x29, 0x2c, 0x22, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, + 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x26, 0x26, 0x28, 0x74, 0x3d, 0x74, + 0x28, 0x46, 0x28, 0x7b, 0x7d, 0x2c, 0x65, 0x29, 0x2c, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x29, 0x2c, 0x74, 0x26, + 0x26, 0x46, 0x28, 0x65, 0x2c, 0x74, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, + 0x21, 0x3d, 0x74, 0x26, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, + 0x76, 0x26, 0x26, 0x28, 0x6e, 0x26, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x5f, 0x73, 0x62, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x6e, 0x29, 0x2c, + 0x71, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x29, 0x7d, 0x2c, 0x49, 0x2e, + 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x66, 0x6f, + 0x72, 0x63, 0x65, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x76, 0x26, 0x26, 0x28, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x21, 0x30, 0x2c, 0x74, 0x26, 0x26, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, + 0x68, 0x28, 0x74, 0x29, 0x2c, 0x71, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, + 0x29, 0x7d, 0x2c, 0x49, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, + 0x70, 0x65, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x3d, 0x52, 0x2c, + 0x55, 0x3d, 0x5b, 0x5d, 0x2c, 0x4e, 0x3d, 0x22, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, + 0x66, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x3f, 0x50, 0x72, + 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, + 0x79, 0x70, 0x65, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x2e, 0x62, 0x69, 0x6e, + 0x64, 0x28, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x2e, 0x72, 0x65, + 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x28, 0x29, 0x29, 0x3a, 0x73, 0x65, 0x74, + 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x2c, 0x50, 0x3d, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x76, + 0x2e, 0x5f, 0x5f, 0x62, 0x2d, 0x6e, 0x2e, 0x5f, 0x5f, 0x76, 0x2e, 0x5f, + 0x5f, 0x62, 0x7d, 0x2c, 0x47, 0x2e, 0x5f, 0x5f, 0x72, 0x3d, 0x30, 0x2c, + 0x24, 0x3d, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x61, 0x74, 0x2c, 0x70, + 0x74, 0x2c, 0x64, 0x74, 0x2c, 0x76, 0x74, 0x2c, 0x79, 0x74, 0x3d, 0x30, + 0x2c, 0x6d, 0x74, 0x3d, 0x5b, 0x5d, 0x2c, 0x67, 0x74, 0x3d, 0x5b, 0x5d, + 0x2c, 0x62, 0x74, 0x3d, 0x77, 0x2e, 0x5f, 0x5f, 0x62, 0x2c, 0x6b, 0x74, + 0x3d, 0x77, 0x2e, 0x5f, 0x5f, 0x72, 0x2c, 0x53, 0x74, 0x3d, 0x77, 0x2e, + 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, 0x2c, 0x78, 0x74, 0x3d, 0x77, 0x2e, + 0x5f, 0x5f, 0x63, 0x2c, 0x77, 0x74, 0x3d, 0x77, 0x2e, 0x75, 0x6e, 0x6d, + 0x6f, 0x75, 0x6e, 0x74, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x43, 
0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x77, 0x2e, + 0x5f, 0x5f, 0x68, 0x26, 0x26, 0x77, 0x2e, 0x5f, 0x5f, 0x68, 0x28, 0x70, + 0x74, 0x2c, 0x74, 0x2c, 0x79, 0x74, 0x7c, 0x7c, 0x6e, 0x29, 0x2c, 0x79, + 0x74, 0x3d, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x70, 0x74, + 0x2e, 0x5f, 0x5f, 0x48, 0x7c, 0x7c, 0x28, 0x70, 0x74, 0x2e, 0x5f, 0x5f, + 0x48, 0x3d, 0x7b, 0x5f, 0x5f, 0x3a, 0x5b, 0x5d, 0x2c, 0x5f, 0x5f, 0x68, + 0x3a, 0x5b, 0x5d, 0x7d, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x20, 0x74, 0x3e, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x2e, 0x6c, 0x65, 0x6e, + 0x67, 0x74, 0x68, 0x26, 0x26, 0x65, 0x2e, 0x5f, 0x5f, 0x2e, 0x70, 0x75, + 0x73, 0x68, 0x28, 0x7b, 0x5f, 0x5f, 0x56, 0x3a, 0x67, 0x74, 0x7d, 0x29, + 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x5b, 0x74, 0x5d, 0x7d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x45, 0x74, 0x28, 0x74, 0x29, 0x7b, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x79, 0x74, 0x3d, 0x31, 0x2c, + 0x55, 0x74, 0x28, 0x42, 0x74, 0x2c, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x55, 0x74, 0x28, 0x74, 0x2c, 0x6e, + 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x69, 0x3d, 0x43, 0x74, + 0x28, 0x61, 0x74, 0x2b, 0x2b, 0x2c, 0x32, 0x29, 0x3b, 0x69, 0x66, 0x28, + 0x69, 0x2e, 0x74, 0x3d, 0x74, 0x2c, 0x21, 0x69, 0x2e, 0x5f, 0x5f, 0x63, + 0x26, 0x26, 0x28, 0x69, 0x2e, 0x5f, 0x5f, 0x3d, 0x5b, 0x65, 0x3f, 0x65, + 0x28, 0x6e, 0x29, 0x3a, 0x42, 0x74, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x30, 0x2c, 0x6e, 0x29, 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x69, + 0x2e, 0x5f, 0x5f, 0x4e, 0x3f, 0x69, 0x2e, 0x5f, 0x5f, 0x4e, 0x5b, 0x30, + 0x5d, 0x3a, 0x69, 0x2e, 0x5f, 0x5f, 0x5b, 0x30, 0x5d, 0x2c, 0x65, 0x3d, + 0x69, 0x2e, 0x74, 0x28, 0x6e, 0x2c, 0x74, 0x29, 0x3b, 0x6e, 0x21, 0x3d, + 0x3d, 0x65, 0x26, 0x26, 0x28, 0x69, 0x2e, 0x5f, 0x5f, 0x4e, 0x3d, 0x5b, + 0x65, 0x2c, 0x69, 0x2e, 0x5f, 0x5f, 0x5b, 0x31, 0x5d, 0x5d, 0x2c, 0x69, + 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, + 0x65, 0x28, 0x7b, 0x7d, 0x29, 0x29, 0x7d, 0x5d, 0x2c, 0x69, 0x2e, 0x5f, + 0x5f, 0x63, 0x3d, 0x70, 0x74, 0x2c, 0x21, 0x70, 0x74, 0x2e, 0x75, 0x29, + 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x3d, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, + 0x69, 0x66, 0x28, 0x21, 0x69, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x5f, 0x5f, + 0x48, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x76, + 0x61, 0x72, 0x20, 0x5f, 0x3d, 0x69, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x5f, + 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x2e, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, - 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x74, 0x2e, 0x5f, - 0x5f, 0x4e, 0x7d, 0x29, 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x21, 0x6f, 0x7c, 0x7c, 0x6f, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, - 0x68, 0x69, 0x73, 0x2c, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x3b, 0x76, - 0x61, 0x72, 0x20, 0x72, 0x3d, 0x21, 0x31, 0x3b, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x5f, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, + 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, + 0x5f, 0x63, 0x7d, 0x29, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x5f, 0x2e, 0x65, + 0x76, 0x65, 0x72, 0x79, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x21, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x7d, 0x29, 0x29, 0x29, 0x72, 0x65, + 0x74, 0x75, 0x72, 
0x6e, 0x21, 0x6f, 0x7c, 0x7c, 0x6f, 0x2e, 0x63, 0x61, + 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74, 0x2c, 0x6e, 0x2c, + 0x65, 0x29, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x72, 0x3d, 0x21, 0x31, 0x3b, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x2e, 0x66, 0x6f, 0x72, + 0x45, 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x74, 0x2e, 0x5f, + 0x5f, 0x4e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x2e, + 0x5f, 0x5f, 0x5b, 0x30, 0x5d, 0x3b, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, + 0x2e, 0x5f, 0x5f, 0x4e, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x3d, 0x76, + 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x6e, 0x21, 0x3d, 0x3d, 0x74, 0x2e, + 0x5f, 0x5f, 0x5b, 0x30, 0x5d, 0x26, 0x26, 0x28, 0x72, 0x3d, 0x21, 0x30, + 0x29, 0x7d, 0x7d, 0x29, 0x29, 0x2c, 0x21, 0x28, 0x21, 0x72, 0x26, 0x26, + 0x69, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x3d, + 0x3d, 0x3d, 0x74, 0x29, 0x26, 0x26, 0x28, 0x21, 0x6f, 0x7c, 0x7c, 0x6f, + 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74, + 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x29, 0x7d, 0x3b, 0x70, 0x74, 0x2e, 0x75, + 0x3d, 0x21, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6f, 0x3d, 0x70, 0x74, + 0x2e, 0x73, 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, + 0x6e, 0x65, 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x2c, 0x72, + 0x3d, 0x70, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, + 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3b, + 0x70, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, + 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x2c, + 0x65, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, + 0x5f, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x69, 0x3d, 0x6f, 0x3b, + 0x6f, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x5f, 0x28, 0x74, + 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x2c, 0x6f, 0x3d, 0x69, 0x7d, 0x72, 0x26, + 0x26, 0x72, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, + 0x2c, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7d, 0x2c, 0x70, 0x74, 0x2e, + 0x73, 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, + 0x65, 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x5f, 0x7d, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x69, 0x2e, 0x5f, 0x5f, 0x4e, + 0x7c, 0x7c, 0x69, 0x2e, 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x48, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, + 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x43, 0x74, 0x28, 0x61, 0x74, 0x2b, + 0x2b, 0x2c, 0x33, 0x29, 0x3b, 0x21, 0x77, 0x2e, 0x5f, 0x5f, 0x73, 0x26, + 0x26, 0x6a, 0x74, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x2c, 0x6e, 0x29, + 0x26, 0x26, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x65, 0x2e, + 0x69, 0x3d, 0x6e, 0x2c, 0x70, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, + 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x65, 0x29, 0x29, 0x7d, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4e, 0x74, 0x28, + 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x43, + 0x74, 0x28, 0x61, 0x74, 0x2b, 0x2b, 0x2c, 0x34, 0x29, 0x3b, 0x21, 0x77, + 0x2e, 0x5f, 0x5f, 0x73, 0x26, 0x26, 0x6a, 0x74, 0x28, 0x65, 0x2e, 0x5f, + 0x5f, 0x48, 0x2c, 0x6e, 0x29, 0x26, 0x26, 0x28, 0x65, 0x2e, 0x5f, 0x5f, + 0x3d, 0x74, 0x2c, 0x65, 0x2e, 0x69, 0x3d, 0x6e, 0x2c, 0x70, 0x74, 0x2e, + 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x65, 0x29, 0x29, + 0x7d, 0x66, 0x75, 
0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x50, 0x74, + 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x79, + 0x74, 0x3d, 0x35, 0x2c, 0x44, 0x74, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x7b, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3a, 0x74, 0x7d, + 0x7d, 0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x24, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, + 0x29, 0x7b, 0x79, 0x74, 0x3d, 0x36, 0x2c, 0x4e, 0x74, 0x28, 0x28, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, + 0x3f, 0x28, 0x74, 0x28, 0x6e, 0x28, 0x29, 0x29, 0x2c, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x74, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x29, + 0x3a, 0x74, 0x3f, 0x28, 0x74, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, + 0x74, 0x3d, 0x6e, 0x28, 0x29, 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x74, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x6e, 0x75, + 0x6c, 0x6c, 0x7d, 0x29, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, + 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x65, 0x3f, 0x65, 0x3a, + 0x65, 0x2e, 0x63, 0x6f, 0x6e, 0x63, 0x61, 0x74, 0x28, 0x74, 0x29, 0x29, + 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x44, 0x74, + 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, + 0x43, 0x74, 0x28, 0x61, 0x74, 0x2b, 0x2b, 0x2c, 0x37, 0x29, 0x3b, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6a, 0x74, 0x28, 0x65, 0x2e, 0x5f, + 0x5f, 0x48, 0x2c, 0x6e, 0x29, 0x3f, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x56, + 0x3d, 0x74, 0x28, 0x29, 0x2c, 0x65, 0x2e, 0x69, 0x3d, 0x6e, 0x2c, 0x65, + 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x74, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x56, + 0x29, 0x3a, 0x65, 0x2e, 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x54, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x79, 0x74, 0x3d, 0x38, 0x2c, + 0x44, 0x74, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x7d, + 0x29, 0x2c, 0x6e, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x56, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, + 0x6e, 0x3d, 0x70, 0x74, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, + 0x5b, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x5d, 0x2c, 0x65, 0x3d, 0x43, 0x74, + 0x28, 0x61, 0x74, 0x2b, 0x2b, 0x2c, 0x39, 0x29, 0x3b, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x65, 0x2e, 0x63, 0x3d, 0x74, 0x2c, 0x6e, 0x3f, + 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x26, + 0x26, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x3d, 0x21, 0x30, 0x2c, 0x6e, 0x2e, + 0x73, 0x75, 0x62, 0x28, 0x70, 0x74, 0x29, 0x29, 0x2c, 0x6e, 0x2e, 0x70, + 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3a, + 0x74, 0x2e, 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x41, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x77, 0x2e, + 0x75, 0x73, 0x65, 0x44, 0x65, 0x62, 0x75, 0x67, 0x56, 0x61, 0x6c, 0x75, + 0x65, 0x26, 0x26, 0x77, 0x2e, 0x75, 0x73, 0x65, 0x44, 0x65, 0x62, 0x75, + 0x67, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x28, 0x6e, 0x3f, 0x6e, 0x28, 0x74, + 0x29, 0x3a, 0x74, 
0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x46, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, + 0x6e, 0x3d, 0x43, 0x74, 0x28, 0x61, 0x74, 0x2b, 0x2b, 0x2c, 0x31, 0x30, + 0x29, 0x2c, 0x65, 0x3d, 0x45, 0x74, 0x28, 0x29, 0x3b, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x70, + 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, + 0x69, 0x64, 0x43, 0x61, 0x74, 0x63, 0x68, 0x7c, 0x7c, 0x28, 0x70, 0x74, + 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, + 0x64, 0x43, 0x61, 0x74, 0x63, 0x68, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x69, 0x29, 0x7b, 0x6e, 0x2e, 0x5f, + 0x5f, 0x26, 0x26, 0x6e, 0x2e, 0x5f, 0x5f, 0x28, 0x74, 0x2c, 0x69, 0x29, + 0x2c, 0x65, 0x5b, 0x31, 0x5d, 0x28, 0x74, 0x29, 0x7d, 0x29, 0x2c, 0x5b, + 0x65, 0x5b, 0x30, 0x5d, 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x28, 0x29, 0x7b, 0x65, 0x5b, 0x31, 0x5d, 0x28, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x30, 0x29, 0x7d, 0x5d, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x4d, 0x74, 0x28, 0x29, 0x7b, 0x76, 0x61, 0x72, + 0x20, 0x74, 0x3d, 0x43, 0x74, 0x28, 0x61, 0x74, 0x2b, 0x2b, 0x2c, 0x31, + 0x31, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x21, 0x74, 0x2e, 0x5f, 0x5f, 0x29, + 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x70, + 0x74, 0x2e, 0x5f, 0x5f, 0x76, 0x3b, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, + 0x3d, 0x6e, 0x26, 0x26, 0x21, 0x6e, 0x2e, 0x5f, 0x5f, 0x6d, 0x26, 0x26, + 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x3b, + 0x29, 0x6e, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x3b, 0x76, 0x61, 0x72, 0x20, + 0x65, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x6d, 0x7c, 0x7c, 0x28, 0x6e, 0x2e, + 0x5f, 0x5f, 0x6d, 0x3d, 0x5b, 0x30, 0x2c, 0x30, 0x5d, 0x29, 0x3b, 0x74, + 0x2e, 0x5f, 0x5f, 0x3d, 0x22, 0x50, 0x22, 0x2b, 0x65, 0x5b, 0x30, 0x5d, + 0x2b, 0x22, 0x2d, 0x22, 0x2b, 0x65, 0x5b, 0x31, 0x5d, 0x2b, 0x2b, 0x7d, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x7d, + 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x57, 0x74, 0x28, + 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x74, 0x3b, + 0x74, 0x3d, 0x6d, 0x74, 0x2e, 0x73, 0x68, 0x69, 0x66, 0x74, 0x28, 0x29, + 0x3b, 0x29, 0x69, 0x66, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x50, 0x26, 0x26, + 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x29, 0x74, 0x72, 0x79, 0x7b, 0x74, 0x2e, + 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, + 0x61, 0x63, 0x68, 0x28, 0x52, 0x74, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, + 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, + 0x68, 0x28, 0x49, 0x74, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, + 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, + 0x28, 0x75, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, + 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x77, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x75, + 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x76, 0x29, 0x7d, 0x7d, 0x77, 0x2e, 0x5f, + 0x5f, 0x62, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, + 0x74, 0x29, 0x7b, 0x70, 0x74, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x62, + 0x74, 0x26, 0x26, 0x62, 0x74, 0x28, 0x74, 0x29, 0x7d, 0x2c, 0x77, 0x2e, + 0x5f, 0x5f, 0x72, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x28, 0x74, 0x29, 0x7b, 0x6b, 0x74, 0x26, 0x26, 0x6b, 0x74, 0x28, 0x74, + 0x29, 0x2c, 0x61, 0x74, 0x3d, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6e, + 0x3d, 0x28, 0x70, 0x74, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x29, 0x2e, + 0x5f, 0x5f, 0x48, 
0x3b, 0x6e, 0x26, 0x26, 0x28, 0x64, 0x74, 0x3d, 0x3d, + 0x3d, 0x70, 0x74, 0x3f, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, + 0x5d, 0x2c, 0x70, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, + 0x6e, 0x2e, 0x5f, 0x5f, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, - 0x29, 0x7b, 0x69, 0x66, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x29, 0x7b, - 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x5b, 0x30, - 0x5d, 0x3b, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, - 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, - 0x30, 0x2c, 0x6e, 0x21, 0x3d, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x5b, 0x30, - 0x5d, 0x26, 0x26, 0x28, 0x72, 0x3d, 0x21, 0x30, 0x29, 0x7d, 0x7d, 0x29, - 0x29, 0x2c, 0x21, 0x28, 0x21, 0x72, 0x26, 0x26, 0x69, 0x2e, 0x5f, 0x5f, - 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x3d, 0x3d, 0x3d, 0x74, 0x29, - 0x26, 0x26, 0x28, 0x21, 0x6f, 0x7c, 0x7c, 0x6f, 0x2e, 0x63, 0x61, 0x6c, - 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74, 0x2c, 0x6e, 0x2c, 0x65, - 0x29, 0x29, 0x7d, 0x3b, 0x68, 0x74, 0x2e, 0x75, 0x3d, 0x21, 0x30, 0x3b, - 0x76, 0x61, 0x72, 0x20, 0x6f, 0x3d, 0x68, 0x74, 0x2e, 0x73, 0x68, 0x6f, - 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, - 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x2c, 0x72, 0x3d, 0x68, 0x74, 0x2e, - 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, - 0x6c, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3b, 0x68, 0x74, 0x2e, 0x63, - 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, - 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x69, - 0x66, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x65, 0x29, 0x7b, - 0x76, 0x61, 0x72, 0x20, 0x69, 0x3d, 0x6f, 0x3b, 0x6f, 0x3d, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x2c, 0x5f, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, - 0x29, 0x2c, 0x6f, 0x3d, 0x69, 0x7d, 0x72, 0x26, 0x26, 0x72, 0x2e, 0x63, - 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74, 0x2c, 0x6e, - 0x2c, 0x65, 0x29, 0x7d, 0x2c, 0x68, 0x74, 0x2e, 0x73, 0x68, 0x6f, 0x75, - 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x55, - 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x5f, 0x7d, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x69, 0x2e, 0x5f, 0x5f, 0x4e, 0x7c, 0x7c, 0x69, 0x2e, - 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x45, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, - 0x65, 0x3d, 0x78, 0x74, 0x28, 0x63, 0x74, 0x2b, 0x2b, 0x2c, 0x33, 0x29, - 0x3b, 0x21, 0x53, 0x2e, 0x5f, 0x5f, 0x73, 0x26, 0x26, 0x52, 0x74, 0x28, - 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x2c, 0x6e, 0x29, 0x26, 0x26, 0x28, 0x65, - 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x65, 0x2e, 0x69, 0x3d, 0x6e, 0x2c, - 0x68, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, - 0x75, 0x73, 0x68, 0x28, 0x65, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x55, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, - 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x78, 0x74, 0x28, 0x63, 0x74, - 0x2b, 0x2b, 0x2c, 0x34, 0x29, 0x3b, 0x21, 0x53, 0x2e, 0x5f, 0x5f, 0x73, - 0x26, 0x26, 0x52, 0x74, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x2c, 0x6e, - 0x29, 0x26, 0x26, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x65, - 0x2e, 0x69, 0x3d, 0x6e, 0x2c, 0x68, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, - 0x70, 0x75, 0x73, 0x68, 0x28, 0x65, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 
0x6f, 0x6e, 0x20, 0x48, 0x74, 0x28, 0x74, 0x29, 0x7b, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x64, 0x74, 0x3d, 0x35, 0x2c, - 0x50, 0x74, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x7b, 0x63, 0x75, - 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3a, 0x74, 0x7d, 0x7d, 0x29, 0x2c, 0x5b, - 0x5d, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x4e, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x64, 0x74, - 0x3d, 0x36, 0x2c, 0x55, 0x74, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, - 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x3f, 0x28, 0x74, 0x28, - 0x6e, 0x28, 0x29, 0x29, 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, - 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x29, 0x3a, 0x74, 0x3f, 0x28, - 0x74, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x6e, 0x28, - 0x29, 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, - 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x63, 0x75, - 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x29, - 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x29, 0x2c, 0x6e, 0x75, - 0x6c, 0x6c, 0x3d, 0x3d, 0x65, 0x3f, 0x65, 0x3a, 0x65, 0x2e, 0x63, 0x6f, - 0x6e, 0x63, 0x61, 0x74, 0x28, 0x74, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x50, 0x74, 0x28, 0x74, 0x2c, 0x6e, - 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x78, 0x74, 0x28, 0x63, - 0x74, 0x2b, 0x2b, 0x2c, 0x37, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x52, 0x74, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x2c, 0x6e, - 0x29, 0x3f, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x56, 0x3d, 0x74, 0x28, 0x29, - 0x2c, 0x65, 0x2e, 0x69, 0x3d, 0x6e, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x68, - 0x3d, 0x74, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x56, 0x29, 0x3a, 0x65, 0x2e, - 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x44, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x64, 0x74, 0x3d, 0x38, 0x2c, 0x50, 0x74, 0x28, 0x28, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x7d, 0x29, 0x2c, 0x6e, 0x29, - 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x24, 0x74, - 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x68, 0x74, - 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5b, 0x74, 0x2e, 0x5f, - 0x5f, 0x63, 0x5d, 0x2c, 0x65, 0x3d, 0x78, 0x74, 0x28, 0x63, 0x74, 0x2b, - 0x2b, 0x2c, 0x39, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, - 0x65, 0x2e, 0x63, 0x3d, 0x74, 0x2c, 0x6e, 0x3f, 0x28, 0x6e, 0x75, 0x6c, - 0x6c, 0x3d, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x26, 0x26, 0x28, 0x65, 0x2e, - 0x5f, 0x5f, 0x3d, 0x21, 0x30, 0x2c, 0x6e, 0x2e, 0x73, 0x75, 0x62, 0x28, - 0x68, 0x74, 0x29, 0x29, 0x2c, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3a, 0x74, 0x2e, 0x5f, 0x5f, - 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x54, 0x74, - 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x53, 0x2e, 0x75, 0x73, 0x65, 0x44, - 0x65, 0x62, 0x75, 0x67, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x26, 0x26, 0x53, - 0x2e, 0x75, 0x73, 0x65, 0x44, 0x65, 0x62, 0x75, 0x67, 0x56, 0x61, 0x6c, - 0x75, 0x65, 0x28, 0x6e, 0x3f, 0x6e, 0x28, 0x74, 0x29, 0x3a, 0x74, 0x29, - 0x7d, 0x66, 0x75, 
0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x56, 0x74, - 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x78, 0x74, - 0x28, 0x63, 0x74, 0x2b, 0x2b, 0x2c, 0x31, 0x30, 0x29, 0x2c, 0x65, 0x3d, - 0x77, 0x74, 0x28, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, - 0x6e, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x68, 0x74, 0x2e, 0x63, 0x6f, - 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, - 0x74, 0x63, 0x68, 0x7c, 0x7c, 0x28, 0x68, 0x74, 0x2e, 0x63, 0x6f, 0x6d, - 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, 0x74, - 0x63, 0x68, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x74, 0x2c, 0x69, 0x29, 0x7b, 0x6e, 0x2e, 0x5f, 0x5f, 0x26, 0x26, 0x6e, - 0x2e, 0x5f, 0x5f, 0x28, 0x74, 0x2c, 0x69, 0x29, 0x2c, 0x65, 0x5b, 0x31, - 0x5d, 0x28, 0x74, 0x29, 0x7d, 0x29, 0x2c, 0x5b, 0x65, 0x5b, 0x30, 0x5d, - 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, - 0x65, 0x5b, 0x31, 0x5d, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x29, - 0x7d, 0x5d, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x41, 0x74, 0x28, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x74, 0x3d, 0x78, - 0x74, 0x28, 0x63, 0x74, 0x2b, 0x2b, 0x2c, 0x31, 0x31, 0x29, 0x3b, 0x69, - 0x66, 0x28, 0x21, 0x74, 0x2e, 0x5f, 0x5f, 0x29, 0x7b, 0x66, 0x6f, 0x72, - 0x28, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x68, 0x74, 0x2e, 0x5f, 0x5f, - 0x76, 0x3b, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, - 0x21, 0x6e, 0x2e, 0x5f, 0x5f, 0x6d, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, - 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x3b, 0x29, 0x6e, 0x3d, 0x6e, - 0x2e, 0x5f, 0x5f, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x6e, 0x2e, - 0x5f, 0x5f, 0x6d, 0x7c, 0x7c, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x6d, 0x3d, - 0x5b, 0x30, 0x2c, 0x30, 0x5d, 0x29, 0x3b, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, - 0x22, 0x50, 0x22, 0x2b, 0x65, 0x5b, 0x30, 0x5d, 0x2b, 0x22, 0x2d, 0x22, - 0x2b, 0x65, 0x5b, 0x31, 0x5d, 0x2b, 0x2b, 0x7d, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x46, 0x74, 0x28, 0x29, 0x7b, 0x66, 0x6f, - 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x74, 0x3b, 0x74, 0x3d, 0x76, 0x74, - 0x2e, 0x73, 0x68, 0x69, 0x66, 0x74, 0x28, 0x29, 0x3b, 0x29, 0x69, 0x66, - 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x50, 0x26, 0x26, 0x74, 0x2e, 0x5f, 0x5f, - 0x48, 0x29, 0x74, 0x72, 0x79, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, - 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, - 0x4f, 0x74, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, - 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x4c, 0x74, - 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, - 0x5b, 0x5d, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x75, 0x29, 0x7b, - 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, - 0x2c, 0x53, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x75, 0x2c, 0x74, 0x2e, 0x5f, - 0x5f, 0x76, 0x29, 0x7d, 0x7d, 0x53, 0x2e, 0x5f, 0x5f, 0x62, 0x3d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x68, - 0x74, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6d, 0x74, 0x26, 0x26, 0x6d, - 0x74, 0x28, 0x74, 0x29, 0x7d, 0x2c, 0x53, 0x2e, 0x5f, 0x5f, 0x72, 0x3d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, - 0x67, 0x74, 0x26, 0x26, 0x67, 0x74, 0x28, 0x74, 0x29, 0x2c, 0x63, 0x74, - 0x3d, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x28, 0x68, 0x74, - 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x29, 0x2e, 0x5f, 0x5f, 0x48, 0x3b, - 0x6e, 0x26, 0x26, 
0x28, 0x61, 0x74, 0x3d, 0x3d, 0x3d, 0x68, 0x74, 0x3f, - 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x68, 0x74, - 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, - 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, - 0x5f, 0x5f, 0x4e, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, - 0x2e, 0x5f, 0x5f, 0x4e, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x3d, - 0x79, 0x74, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x3d, 0x74, 0x2e, 0x69, - 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x29, 0x29, 0x29, 0x3a, - 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, - 0x63, 0x68, 0x28, 0x4f, 0x74, 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, - 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x4c, 0x74, 0x29, - 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x63, 0x74, - 0x3d, 0x30, 0x29, 0x29, 0x2c, 0x61, 0x74, 0x3d, 0x68, 0x74, 0x7d, 0x2c, - 0x53, 0x2e, 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, 0x3d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x62, 0x74, 0x26, - 0x26, 0x62, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6e, - 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x6e, 0x26, 0x26, 0x6e, 0x2e, - 0x5f, 0x5f, 0x48, 0x26, 0x26, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, - 0x5f, 0x5f, 0x68, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x26, 0x26, - 0x28, 0x31, 0x21, 0x3d, 0x3d, 0x76, 0x74, 0x2e, 0x70, 0x75, 0x73, 0x68, - 0x28, 0x6e, 0x29, 0x26, 0x26, 0x70, 0x74, 0x3d, 0x3d, 0x3d, 0x53, 0x2e, - 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x41, 0x6e, 0x69, 0x6d, 0x61, - 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x7c, 0x7c, 0x28, - 0x28, 0x70, 0x74, 0x3d, 0x53, 0x2e, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, - 0x74, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, - 0x61, 0x6d, 0x65, 0x29, 0x7c, 0x7c, 0x57, 0x74, 0x29, 0x28, 0x46, 0x74, - 0x29, 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x2e, - 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x69, - 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x3d, 0x74, 0x2e, 0x69, - 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x21, 0x3d, 0x3d, 0x79, 0x74, - 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, - 0x56, 0x29, 0x2c, 0x74, 0x2e, 0x69, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, - 0x30, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x3d, 0x79, 0x74, 0x7d, 0x29, - 0x29, 0x29, 0x2c, 0x61, 0x74, 0x3d, 0x68, 0x74, 0x3d, 0x6e, 0x75, 0x6c, - 0x6c, 0x7d, 0x2c, 0x53, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x6e, - 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x72, 0x79, 0x7b, 0x74, + 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x26, 0x26, 0x28, 0x74, 0x2e, + 0x5f, 0x5f, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x29, 0x2c, 0x74, 0x2e, + 0x5f, 0x5f, 0x56, 0x3d, 0x67, 0x74, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, + 0x3d, 0x74, 0x2e, 0x69, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, + 0x29, 0x29, 0x29, 0x3a, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, + 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x52, 0x74, 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, - 0x28, 0x4f, 0x74, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x74, - 0x2e, 0x5f, 0x5f, 
0x68, 0x2e, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x28, + 0x28, 0x49, 0x74, 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, + 0x5d, 0x2c, 0x61, 0x74, 0x3d, 0x30, 0x29, 0x29, 0x2c, 0x64, 0x74, 0x3d, + 0x70, 0x74, 0x7d, 0x2c, 0x77, 0x2e, 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, + 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, + 0x7b, 0x53, 0x74, 0x26, 0x26, 0x53, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x76, + 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x6e, + 0x26, 0x26, 0x6e, 0x2e, 0x5f, 0x5f, 0x48, 0x26, 0x26, 0x28, 0x6e, 0x2e, + 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x6c, 0x65, 0x6e, 0x67, + 0x74, 0x68, 0x26, 0x26, 0x28, 0x31, 0x21, 0x3d, 0x3d, 0x6d, 0x74, 0x2e, + 0x70, 0x75, 0x73, 0x68, 0x28, 0x6e, 0x29, 0x26, 0x26, 0x76, 0x74, 0x3d, + 0x3d, 0x3d, 0x77, 0x2e, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x41, + 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, + 0x65, 0x7c, 0x7c, 0x28, 0x28, 0x76, 0x74, 0x3d, 0x77, 0x2e, 0x72, 0x65, + 0x71, 0x75, 0x65, 0x73, 0x74, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, + 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x29, 0x7c, 0x7c, 0x4c, 0x74, + 0x29, 0x28, 0x57, 0x74, 0x29, 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x48, + 0x2e, 0x5f, 0x5f, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, - 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x74, 0x2e, 0x5f, 0x5f, - 0x7c, 0x7c, 0x4c, 0x74, 0x28, 0x74, 0x29, 0x7d, 0x29, 0x29, 0x7d, 0x63, - 0x61, 0x74, 0x63, 0x68, 0x28, 0x73, 0x29, 0x7b, 0x6e, 0x2e, 0x73, 0x6f, - 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x26, 0x26, 0x28, - 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x29, 0x7d, 0x29, 0x29, - 0x2c, 0x6e, 0x3d, 0x5b, 0x5d, 0x2c, 0x53, 0x2e, 0x5f, 0x5f, 0x65, 0x28, - 0x73, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x76, 0x29, 0x7d, 0x7d, 0x29, 0x29, - 0x2c, 0x6b, 0x74, 0x26, 0x26, 0x6b, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, - 0x7d, 0x2c, 0x53, 0x2e, 0x75, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x3d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, - 0x53, 0x74, 0x26, 0x26, 0x53, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x76, 0x61, - 0x72, 0x20, 0x6e, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, - 0x65, 0x26, 0x26, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x26, 0x26, 0x28, 0x65, - 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x2e, 0x66, 0x6f, 0x72, 0x45, - 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x72, 0x79, 0x7b, 0x4f, 0x74, 0x28, - 0x74, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, - 0x6e, 0x3d, 0x74, 0x7d, 0x7d, 0x29, 0x29, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, - 0x48, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x6e, 0x26, 0x26, - 0x53, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x6e, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, - 0x76, 0x29, 0x29, 0x7d, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x4d, 0x74, 0x3d, - 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, - 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, - 0x73, 0x74, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, - 0x72, 0x61, 0x6d, 0x65, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x57, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, - 0x6e, 0x2c, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x29, 0x7b, 0x63, 0x6c, 0x65, 0x61, 0x72, 0x54, 0x69, 0x6d, 0x65, - 0x6f, 0x75, 0x74, 
0x28, 0x69, 0x29, 0x2c, 0x4d, 0x74, 0x26, 0x26, 0x63, - 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, - 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x28, 0x6e, 0x29, 0x2c, 0x73, - 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x28, 0x74, 0x29, - 0x7d, 0x2c, 0x69, 0x3d, 0x73, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x6f, - 0x75, 0x74, 0x28, 0x65, 0x2c, 0x31, 0x30, 0x30, 0x29, 0x3b, 0x4d, 0x74, - 0x26, 0x26, 0x28, 0x6e, 0x3d, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, - 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, - 0x6d, 0x65, 0x28, 0x65, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x4f, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, - 0x72, 0x20, 0x6e, 0x3d, 0x68, 0x74, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x5f, - 0x5f, 0x63, 0x3b, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x65, 0x26, - 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x2c, 0x65, 0x28, 0x29, 0x29, 0x2c, 0x68, 0x74, 0x3d, 0x6e, - 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4c, 0x74, - 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x68, 0x74, - 0x3b, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x28, - 0x29, 0x2c, 0x68, 0x74, 0x3d, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x52, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x74, 0x7c, 0x7c, 0x74, 0x2e, - 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x6c, - 0x65, 0x6e, 0x67, 0x74, 0x68, 0x7c, 0x7c, 0x6e, 0x2e, 0x73, 0x6f, 0x6d, - 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, - 0x6e, 0x21, 0x3d, 0x3d, 0x74, 0x5b, 0x65, 0x5d, 0x7d, 0x29, 0x29, 0x7d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x49, 0x74, 0x28, - 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x22, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, - 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x3f, 0x6e, 0x28, 0x74, 0x29, - 0x3a, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x6a, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x53, 0x5b, 0x74, 0x5d, - 0x3d, 0x6e, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x6e, 0x75, 0x6c, 0x6c, - 0x2c, 0x53, 0x5b, 0x74, 0x5d, 0x7c, 0x7c, 0x28, 0x28, 0x29, 0x3d, 0x3e, - 0x7b, 0x7d, 0x29, 0x29, 0x7d, 0x6c, 0x65, 0x74, 0x20, 0x71, 0x74, 0x2c, - 0x42, 0x74, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x47, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x42, 0x74, 0x29, - 0x42, 0x74, 0x28, 0x29, 0x3b, 0x42, 0x74, 0x3d, 0x74, 0x26, 0x26, 0x74, - 0x2e, 0x53, 0x28, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x7a, 0x74, 0x28, 0x7b, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x74, - 0x7d, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x4b, - 0x74, 0x28, 0x74, 0x29, 0x3b, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x3d, 0x74, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x50, - 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, - 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x76, 0x3b, 0x77, 0x68, - 0x69, 0x6c, 0x65, 0x28, 0x74, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x29, 0x69, - 0x66, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x29, 0x7b, 0x74, 0x2e, 0x5f, - 0x5f, 0x63, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x34, 0x3b, 0x62, - 0x72, 0x65, 0x61, 
0x6b, 0x7d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, - 0x24, 0x75, 0x2e, 0x63, 0x3d, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x2e, 0x64, 0x61, 0x74, 0x61, - 0x3d, 0x65, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x28, 0x29, 0x7d, 0x3b, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x64, 0x28, 0x28, 0x29, 0x3d, 0x3e, + 0x7b, 0x74, 0x2e, 0x69, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x48, + 0x3d, 0x74, 0x2e, 0x69, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x21, + 0x3d, 0x3d, 0x67, 0x74, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, + 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x29, 0x2c, 0x74, 0x2e, 0x69, 0x3d, 0x76, + 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x3d, + 0x67, 0x74, 0x7d, 0x29, 0x29, 0x29, 0x2c, 0x64, 0x74, 0x3d, 0x70, 0x74, + 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x2c, 0x77, 0x2e, 0x5f, 0x5f, 0x63, + 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, + 0x6e, 0x29, 0x7b, 0x6e, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, + 0x72, 0x79, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f, 0x72, + 0x45, 0x61, 0x63, 0x68, 0x28, 0x52, 0x74, 0x29, 0x2c, 0x74, 0x2e, 0x5f, + 0x5f, 0x68, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x69, 0x6c, + 0x74, 0x65, 0x72, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, + 0x74, 0x2e, 0x5f, 0x5f, 0x7c, 0x7c, 0x49, 0x74, 0x28, 0x74, 0x29, 0x7d, + 0x29, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x73, 0x29, 0x7b, + 0x6e, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, + 0x68, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, + 0x29, 0x7d, 0x29, 0x29, 0x2c, 0x6e, 0x3d, 0x5b, 0x5d, 0x2c, 0x77, 0x2e, + 0x5f, 0x5f, 0x65, 0x28, 0x73, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x76, 0x29, + 0x7d, 0x7d, 0x29, 0x29, 0x2c, 0x78, 0x74, 0x26, 0x26, 0x78, 0x74, 0x28, + 0x74, 0x2c, 0x6e, 0x29, 0x7d, 0x2c, 0x77, 0x2e, 0x75, 0x6e, 0x6d, 0x6f, + 0x75, 0x6e, 0x74, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x28, 0x74, 0x29, 0x7b, 0x77, 0x74, 0x26, 0x26, 0x77, 0x74, 0x28, 0x74, + 0x29, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x2c, 0x65, 0x3d, 0x74, 0x2e, + 0x5f, 0x5f, 0x63, 0x3b, 0x65, 0x26, 0x26, 0x65, 0x2e, 0x5f, 0x5f, 0x48, + 0x26, 0x26, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x2e, + 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x72, 0x79, + 0x7b, 0x52, 0x74, 0x28, 0x74, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, + 0x28, 0x74, 0x29, 0x7b, 0x6e, 0x3d, 0x74, 0x7d, 0x7d, 0x29, 0x29, 0x2c, + 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, + 0x2c, 0x6e, 0x26, 0x26, 0x77, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x6e, 0x2c, + 0x65, 0x2e, 0x5f, 0x5f, 0x76, 0x29, 0x29, 0x7d, 0x3b, 0x76, 0x61, 0x72, + 0x20, 0x4f, 0x74, 0x3d, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x72, + 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x3b, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4c, 0x74, 0x28, 0x74, 0x29, 0x7b, + 0x76, 0x61, 0x72, 0x20, 0x6e, 0x2c, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x63, 0x6c, 0x65, 0x61, 0x72, + 0x54, 0x69, 0x6d, 
0x65, 0x6f, 0x75, 0x74, 0x28, 0x69, 0x29, 0x2c, 0x4f, + 0x74, 0x26, 0x26, 0x63, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x41, 0x6e, 0x69, + 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x28, + 0x6e, 0x29, 0x2c, 0x73, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, + 0x74, 0x28, 0x74, 0x29, 0x7d, 0x2c, 0x69, 0x3d, 0x73, 0x65, 0x74, 0x54, + 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x28, 0x65, 0x2c, 0x31, 0x30, 0x30, + 0x29, 0x3b, 0x4f, 0x74, 0x26, 0x26, 0x28, 0x6e, 0x3d, 0x72, 0x65, 0x71, + 0x75, 0x65, 0x73, 0x74, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x28, 0x65, 0x29, 0x29, 0x7d, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x52, 0x74, 0x28, 0x74, + 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x70, 0x74, 0x2c, 0x65, + 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x22, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, + 0x66, 0x20, 0x65, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, + 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x65, 0x28, 0x29, 0x29, 0x2c, + 0x70, 0x74, 0x3d, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, + 0x6e, 0x20, 0x49, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, + 0x6e, 0x3d, 0x70, 0x74, 0x3b, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x74, + 0x2e, 0x5f, 0x5f, 0x28, 0x29, 0x2c, 0x70, 0x74, 0x3d, 0x6e, 0x7d, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6a, 0x74, 0x28, 0x74, + 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x74, + 0x7c, 0x7c, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x21, 0x3d, + 0x3d, 0x6e, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x7c, 0x7c, 0x6e, + 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x28, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x21, 0x3d, 0x3d, 0x74, 0x5b, 0x65, 0x5d, + 0x7d, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x20, 0x42, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x3f, + 0x6e, 0x28, 0x74, 0x29, 0x3a, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x71, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, + 0x77, 0x5b, 0x74, 0x5d, 0x3d, 0x6e, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, + 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x77, 0x5b, 0x74, 0x5d, 0x7c, 0x7c, 0x28, + 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x7d, 0x29, 0x29, 0x7d, 0x6c, 0x65, 0x74, + 0x20, 0x47, 0x74, 0x2c, 0x7a, 0x74, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x20, 0x4a, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, + 0x28, 0x7a, 0x74, 0x29, 0x7a, 0x74, 0x28, 0x29, 0x3b, 0x7a, 0x74, 0x3d, + 0x74, 0x26, 0x26, 0x74, 0x2e, 0x53, 0x28, 0x29, 0x7d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4b, 0x74, 0x28, 0x7b, 0x64, 0x61, + 0x74, 0x61, 0x3a, 0x74, 0x7d, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x6e, 0x3d, 0x58, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x6e, 0x2e, 0x76, + 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x74, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x65, 0x3d, 0x44, 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x6c, + 0x65, 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, + 0x76, 0x3b, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x28, 0x74, 0x3d, 0x74, 0x2e, + 0x5f, 0x5f, 0x29, 0x69, 0x66, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x29, + 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x7c, + 0x3d, 0x34, 0x3b, 
0x62, 0x72, 0x65, 0x61, 0x6b, 0x7d, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x5f, 0x5f, 0x24, 0x75, 0x2e, 0x63, 0x3d, 0x28, 0x29, 0x3d, + 0x3e, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x74, 0x3b, 0x69, 0x66, 0x28, 0x21, + 0x45, 0x28, 0x65, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x28, 0x29, 0x29, 0x26, + 0x26, 0x33, 0x3d, 0x3d, 0x3d, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, + 0x28, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x62, 0x61, 0x73, 0x65, + 0x29, 0x3f, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3a, 0x74, 0x2e, 0x6e, + 0x6f, 0x64, 0x65, 0x54, 0x79, 0x70, 0x65, 0x29, 0x29, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3d, + 0x65, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x28, 0x29, 0x3b, 0x65, 0x6c, 0x73, + 0x65, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x7c, + 0x3d, 0x31, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x65, 0x74, 0x53, + 0x74, 0x61, 0x74, 0x65, 0x28, 0x7b, 0x7d, 0x29, 0x7d, 0x7d, 0x3b, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x79, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x3f, 0x30, 0x3a, 0x21, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x3f, 0x22, 0x22, 0x3a, 0x74, 0x7c, 0x7c, 0x22, 0x22, 0x7d, 0x29, 0x7d, 0x2c, 0x5b, 0x5d, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x7d, 0x7a, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, + 0x65, 0x7d, 0x4b, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x4e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x5f, 0x73, 0x74, 0x22, 0x3b, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, - 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x28, 0x66, + 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x28, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x3a, 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x62, 0x6c, 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x2c, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x62, 0x6c, - 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x7a, + 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x4b, 0x74, 0x7d, 0x2c, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x3a, 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x62, 0x6c, 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x67, 0x65, 0x74, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, @@ -1560,7 +1583,7 @@ unsigned char index_js[] = { 0x7d, 0x7d, 0x7d, 0x2c, 0x5f, 0x5f, 0x62, 0x3a, 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x62, 0x6c, 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x31, 0x7d, 0x7d, 0x29, 0x3b, - 0x6a, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x62, 0x22, 0x2c, 0x28, 0x74, 0x2c, + 0x71, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x62, 0x22, 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x29, 0x7b, 0x6c, 0x65, 0x74, @@ -1570,34 +1593,34 @@ unsigned char index_js[] = { 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x22, 0x3d, 0x3d, 0x3d, 0x69, 0x29, 0x63, 0x6f, 0x6e, 0x74, 0x69, 0x6e, 0x75, 0x65, 0x3b, 0x6c, 0x65, 0x74, 0x20, 0x5f, 0x3d, 0x65, 0x5b, 0x69, 0x5d, 0x3b, 
0x69, 0x66, 0x28, 0x5f, 0x20, - 0x69, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x6f, 0x66, 0x20, 0x66, + 0x69, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x6f, 0x66, 0x20, 0x63, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, 0x74, 0x29, 0x6e, 0x2e, 0x5f, 0x5f, 0x6e, 0x70, 0x3d, 0x74, 0x3d, 0x7b, 0x7d, 0x3b, 0x74, 0x5b, 0x69, 0x5d, 0x3d, 0x5f, 0x3b, 0x65, 0x5b, 0x69, 0x5d, 0x3d, 0x5f, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x28, 0x29, 0x7d, 0x7d, 0x7d, 0x74, 0x28, 0x6e, 0x29, 0x7d, - 0x29, 0x3b, 0x6a, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x72, 0x22, 0x2c, 0x28, - 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x47, 0x74, 0x28, 0x29, 0x3b, + 0x29, 0x3b, 0x71, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x72, 0x22, 0x2c, 0x28, + 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x4a, 0x74, 0x28, 0x29, 0x3b, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x2c, 0x69, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x69, 0x66, 0x28, 0x69, 0x29, 0x7b, 0x69, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x65, 0x3d, 0x69, 0x2e, 0x5f, 0x5f, 0x24, 0x75, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x65, 0x29, 0x69, 0x2e, 0x5f, 0x5f, 0x24, 0x75, 0x3d, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x74, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3b, 0x62, 0x28, 0x28, + 0x74, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3b, 0x53, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x6e, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x7d, 0x29, 0x29, 0x3b, 0x6e, 0x2e, 0x63, 0x3d, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x69, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x31, 0x3b, 0x69, 0x2e, 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x28, 0x7b, 0x7d, 0x29, 0x7d, 0x3b, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x28, 0x29, 0x7d, 0x71, 0x74, 0x3d, 0x69, - 0x3b, 0x47, 0x74, 0x28, 0x65, 0x29, 0x3b, 0x74, 0x28, 0x6e, 0x29, 0x7d, - 0x29, 0x3b, 0x6a, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x65, 0x22, 0x2c, 0x28, - 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x29, 0x3d, 0x3e, 0x7b, 0x47, - 0x74, 0x28, 0x29, 0x3b, 0x71, 0x74, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, + 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x28, 0x29, 0x7d, 0x47, 0x74, 0x3d, 0x69, + 0x3b, 0x4a, 0x74, 0x28, 0x65, 0x29, 0x3b, 0x74, 0x28, 0x6e, 0x29, 0x7d, + 0x29, 0x3b, 0x71, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x65, 0x22, 0x2c, 0x28, + 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x29, 0x3d, 0x3e, 0x7b, 0x4a, + 0x74, 0x28, 0x29, 0x3b, 0x47, 0x74, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x28, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x29, 0x7d, 0x29, - 0x3b, 0x6a, 0x74, 0x28, 0x22, 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, 0x22, - 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x47, 0x74, 0x28, - 0x29, 0x3b, 0x71, 0x74, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, + 0x3b, 0x71, 0x74, 0x28, 0x22, 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, 0x22, + 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x4a, 0x74, 0x28, + 0x29, 0x3b, 0x47, 0x74, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x3b, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x26, 0x26, 0x28, 0x65, @@ -1617,19 +1640,19 @@ unsigned char index_js[] = { 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6f, 0x3d, 0x6e, 0x5b, 0x5f, 0x5d, 0x2c, 0x72, 0x3d, 0x74, 0x5b, 0x5f, 0x5d, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x6f, 0x29, 0x7b, 0x6f, 0x3d, - 0x4a, 0x74, 0x28, 0x65, 0x2c, 0x5f, 0x2c, 0x72, 0x2c, 0x69, 0x29, 0x3b, + 0x51, 0x74, 0x28, 0x65, 0x2c, 0x5f, 
0x2c, 0x72, 0x2c, 0x69, 0x29, 0x3b, 0x6e, 0x5b, 0x5f, 0x5d, 0x3d, 0x6f, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x6f, 0x2e, 0x6f, 0x28, 0x72, 0x2c, 0x69, 0x29, 0x7d, 0x7d, 0x7d, 0x74, 0x28, 0x6e, 0x29, 0x7d, 0x29, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x4a, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, + 0x6f, 0x6e, 0x20, 0x51, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x5f, 0x3d, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x2e, 0x6f, 0x77, 0x6e, 0x65, 0x72, 0x53, 0x56, 0x47, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x6f, 0x3d, - 0x73, 0x28, 0x65, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x7b, + 0x68, 0x28, 0x65, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x7b, 0x6f, 0x3a, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x6f, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x74, 0x3b, 0x69, 0x3d, 0x6e, 0x7d, - 0x2c, 0x64, 0x3a, 0x62, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x63, 0x6f, + 0x2c, 0x64, 0x3a, 0x53, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x6f, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x69, 0x66, 0x28, 0x69, 0x5b, 0x6e, 0x5d, 0x21, 0x3d, 0x3d, 0x65, 0x29, 0x7b, 0x69, 0x5b, 0x6e, @@ -1639,7 +1662,7 @@ unsigned char index_js[] = { 0x62, 0x75, 0x74, 0x65, 0x28, 0x6e, 0x2c, 0x65, 0x29, 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x74, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x41, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x28, 0x6e, 0x29, 0x7d, - 0x7d, 0x29, 0x7d, 0x7d, 0x6a, 0x74, 0x28, 0x22, 0x75, 0x6e, 0x6d, 0x6f, + 0x7d, 0x29, 0x7d, 0x7d, 0x71, 0x74, 0x28, 0x22, 0x75, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x22, 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x2e, 0x74, @@ -1656,196 +1679,198 @@ unsigned char index_js[] = { 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x24, 0x75, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x24, 0x75, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x6e, 0x2e, 0x64, 0x28, 0x29, - 0x7d, 0x7d, 0x7d, 0x74, 0x28, 0x6e, 0x29, 0x7d, 0x29, 0x3b, 0x6a, 0x74, + 0x7d, 0x7d, 0x7d, 0x74, 0x28, 0x6e, 0x29, 0x7d, 0x29, 0x3b, 0x71, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x68, 0x22, 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x29, 0x3d, 0x3e, 0x7b, 0x69, 0x66, 0x28, 0x69, 0x3c, - 0x33, 0x29, 0x6e, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x32, 0x3b, - 0x74, 0x28, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x29, 0x7d, 0x29, 0x3b, 0x4c, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x73, - 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, - 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x5f, 0x5f, 0x24, 0x75, 0x3b, 0x69, 0x66, 0x28, 0x21, 0x28, 0x65, 0x26, - 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x2e, - 0x73, 0x7c, 0x7c, 0x34, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, - 0x24, 0x66, 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, - 0x3b, 0x69, 0x66, 0x28, 0x33, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, - 0x5f, 0x24, 0x66, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, - 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 
0x20, 0x69, 0x20, 0x69, - 0x6e, 0x20, 0x6e, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, - 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x69, - 0x6e, 0x20, 0x74, 0x29, 0x69, 0x66, 0x28, 0x22, 0x5f, 0x5f, 0x73, 0x6f, - 0x75, 0x72, 0x63, 0x65, 0x22, 0x21, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x74, - 0x5b, 0x69, 0x5d, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, - 0x72, 0x6f, 0x70, 0x73, 0x5b, 0x69, 0x5d, 0x29, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, - 0x20, 0x69, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, - 0x72, 0x6f, 0x70, 0x73, 0x29, 0x69, 0x66, 0x28, 0x21, 0x28, 0x69, 0x20, - 0x69, 0x6e, 0x20, 0x74, 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x21, 0x30, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x31, 0x7d, - 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4b, 0x74, - 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x50, - 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x73, 0x28, 0x74, 0x29, 0x2c, 0x5b, + 0x33, 0x7c, 0x7c, 0x39, 0x3d, 0x3d, 0x3d, 0x69, 0x29, 0x6e, 0x2e, 0x5f, + 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x32, 0x3b, 0x74, 0x28, 0x6e, 0x2c, 0x65, + 0x2c, 0x69, 0x29, 0x7d, 0x29, 0x3b, 0x49, 0x2e, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x73, 0x68, 0x6f, 0x75, 0x6c, 0x64, + 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x55, 0x70, 0x64, + 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x65, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x24, 0x75, 0x3b, + 0x69, 0x66, 0x28, 0x21, 0x28, 0x65, 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64, + 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x2e, 0x73, 0x7c, 0x7c, 0x34, 0x26, + 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x29, 0x29, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x69, 0x66, 0x28, 0x33, + 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x29, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28, + 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x69, 0x6e, 0x20, 0x6e, 0x29, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28, + 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x29, 0x69, + 0x66, 0x28, 0x22, 0x5f, 0x5f, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x22, + 0x21, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x74, 0x5b, 0x69, 0x5d, 0x21, 0x3d, + 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x5b, + 0x69, 0x5d, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, + 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x69, 0x6e, + 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, + 0x69, 0x66, 0x28, 0x21, 0x28, 0x69, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x29, + 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x21, 0x31, 0x7d, 0x3b, 0x66, 0x75, 0x6e, 0x63, + 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x58, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x72, + 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x44, 0x74, 0x28, 0x28, 0x29, 0x3d, + 0x3e, 0x68, 0x28, 0x74, 0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x66, 0x75, + 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x59, 0x74, 0x28, 0x74, 0x29, + 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x50, 0x74, 0x28, + 0x74, 0x29, 0x3b, 0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, + 0x3d, 0x74, 0x3b, 0x47, 0x74, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x7c, 0x3d, + 0x34, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 
0x20, 0x44, 0x74, 0x28, + 0x28, 0x29, 0x3d, 0x3e, 0x79, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x6e, 0x2e, + 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x28, 0x29, 0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x51, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x6e, 0x3d, 0x48, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x6e, 0x2e, 0x63, 0x75, - 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x74, 0x3b, 0x71, 0x74, 0x2e, 0x5f, - 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x34, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x50, 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x64, 0x28, 0x28, - 0x29, 0x3d, 0x3e, 0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, - 0x28, 0x29, 0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x58, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x48, 0x74, 0x28, 0x74, 0x29, - 0x3b, 0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x74, - 0x3b, 0x45, 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x62, 0x28, 0x28, 0x29, - 0x3d, 0x3e, 0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x28, - 0x29, 0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x76, 0x61, 0x72, 0x20, 0x59, - 0x74, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, - 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, - 0x5f, 0x3b, 0x6e, 0x5b, 0x30, 0x5d, 0x3d, 0x30, 0x3b, 0x66, 0x6f, 0x72, - 0x28, 0x76, 0x61, 0x72, 0x20, 0x6f, 0x3d, 0x31, 0x3b, 0x6f, 0x3c, 0x6e, - 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x6f, 0x2b, 0x2b, 0x29, - 0x7b, 0x76, 0x61, 0x72, 0x20, 0x72, 0x3d, 0x6e, 0x5b, 0x6f, 0x2b, 0x2b, - 0x5d, 0x2c, 0x75, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3f, 0x28, 0x6e, 0x5b, - 0x30, 0x5d, 0x7c, 0x3d, 0x72, 0x3f, 0x31, 0x3a, 0x32, 0x2c, 0x65, 0x5b, - 0x6e, 0x5b, 0x6f, 0x2b, 0x2b, 0x5d, 0x5d, 0x29, 0x3a, 0x6e, 0x5b, 0x2b, - 0x2b, 0x6f, 0x5d, 0x3b, 0x33, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x69, 0x5b, - 0x30, 0x5d, 0x3d, 0x75, 0x3a, 0x34, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x69, - 0x5b, 0x31, 0x5d, 0x3d, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x61, - 0x73, 0x73, 0x69, 0x67, 0x6e, 0x28, 0x69, 0x5b, 0x31, 0x5d, 0x7c, 0x7c, - 0x7b, 0x7d, 0x2c, 0x75, 0x29, 0x3a, 0x35, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, - 0x28, 0x69, 0x5b, 0x31, 0x5d, 0x3d, 0x69, 0x5b, 0x31, 0x5d, 0x7c, 0x7c, - 0x7b, 0x7d, 0x29, 0x5b, 0x6e, 0x5b, 0x2b, 0x2b, 0x6f, 0x5d, 0x5d, 0x3d, - 0x75, 0x3a, 0x36, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x69, 0x5b, 0x31, 0x5d, - 0x5b, 0x6e, 0x5b, 0x2b, 0x2b, 0x6f, 0x5d, 0x5d, 0x2b, 0x3d, 0x75, 0x2b, - 0x22, 0x22, 0x3a, 0x72, 0x3f, 0x28, 0x5f, 0x3d, 0x74, 0x2e, 0x61, 0x70, - 0x70, 0x6c, 0x79, 0x28, 0x75, 0x2c, 0x59, 0x74, 0x28, 0x74, 0x2c, 0x75, - 0x2c, 0x65, 0x2c, 0x5b, 0x22, 0x22, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x5d, - 0x29, 0x29, 0x2c, 0x69, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x5f, 0x29, - 0x2c, 0x75, 0x5b, 0x30, 0x5d, 0x3f, 0x6e, 0x5b, 0x30, 0x5d, 0x7c, 0x3d, - 0x32, 0x3a, 0x28, 0x6e, 0x5b, 0x6f, 0x2d, 0x32, 0x5d, 0x3d, 0x30, 0x2c, - 0x6e, 0x5b, 0x6f, 0x5d, 0x3d, 0x5f, 0x29, 0x29, 0x3a, 0x69, 0x2e, 0x70, - 0x75, 0x73, 0x68, 0x28, 0x75, 0x29, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x69, 0x7d, 0x2c, 0x5a, 0x74, 0x3d, 0x6e, 0x65, 0x77, 0x20, - 0x4d, 0x61, 0x70, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x74, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, - 0x3d, 0x5a, 0x74, 0x2e, 0x67, 0x65, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, - 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7c, 0x7c, - 0x28, 0x6e, 0x3d, 0x6e, 0x65, 0x77, 0x20, 0x4d, 
0x61, 0x70, 0x2c, 0x5a, - 0x74, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x6e, - 0x29, 0x29, 0x2c, 0x28, 0x6e, 0x3d, 0x59, 0x74, 0x28, 0x74, 0x68, 0x69, - 0x73, 0x2c, 0x6e, 0x2e, 0x67, 0x65, 0x74, 0x28, 0x74, 0x29, 0x7c, 0x7c, - 0x28, 0x6e, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x3d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x66, - 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x2c, 0x65, 0x2c, 0x69, - 0x3d, 0x31, 0x2c, 0x5f, 0x3d, 0x22, 0x22, 0x2c, 0x6f, 0x3d, 0x22, 0x22, - 0x2c, 0x72, 0x3d, 0x5b, 0x30, 0x5d, 0x2c, 0x75, 0x3d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x31, 0x3d, 0x3d, - 0x3d, 0x69, 0x26, 0x26, 0x28, 0x74, 0x7c, 0x7c, 0x28, 0x5f, 0x3d, 0x5f, - 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x5c, - 0x73, 0x2a, 0x5c, 0x6e, 0x5c, 0x73, 0x2a, 0x7c, 0x5c, 0x73, 0x2a, 0x5c, - 0x6e, 0x5c, 0x73, 0x2a, 0x24, 0x2f, 0x67, 0x2c, 0x22, 0x22, 0x29, 0x29, - 0x29, 0x3f, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x30, 0x2c, 0x74, - 0x2c, 0x5f, 0x29, 0x3a, 0x33, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x28, - 0x74, 0x7c, 0x7c, 0x5f, 0x29, 0x3f, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, - 0x68, 0x28, 0x33, 0x2c, 0x74, 0x2c, 0x5f, 0x29, 0x2c, 0x69, 0x3d, 0x32, - 0x29, 0x3a, 0x32, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x22, 0x2e, 0x2e, - 0x2e, 0x22, 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x74, 0x3f, 0x72, 0x2e, - 0x70, 0x75, 0x73, 0x68, 0x28, 0x34, 0x2c, 0x74, 0x2c, 0x30, 0x29, 0x3a, - 0x32, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x5f, 0x26, 0x26, 0x21, 0x74, - 0x3f, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x35, 0x2c, 0x30, 0x2c, - 0x21, 0x30, 0x2c, 0x5f, 0x29, 0x3a, 0x69, 0x3e, 0x3d, 0x35, 0x26, 0x26, - 0x28, 0x28, 0x5f, 0x7c, 0x7c, 0x21, 0x74, 0x26, 0x26, 0x35, 0x3d, 0x3d, - 0x3d, 0x69, 0x29, 0x26, 0x26, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, - 0x28, 0x69, 0x2c, 0x30, 0x2c, 0x5f, 0x2c, 0x65, 0x29, 0x2c, 0x69, 0x3d, - 0x36, 0x29, 0x2c, 0x74, 0x26, 0x26, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, - 0x68, 0x28, 0x69, 0x2c, 0x74, 0x2c, 0x30, 0x2c, 0x65, 0x29, 0x2c, 0x69, - 0x3d, 0x36, 0x29, 0x29, 0x2c, 0x5f, 0x3d, 0x22, 0x22, 0x7d, 0x2c, 0x6c, - 0x3d, 0x30, 0x3b, 0x6c, 0x3c, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, - 0x68, 0x3b, 0x6c, 0x2b, 0x2b, 0x29, 0x7b, 0x6c, 0x26, 0x26, 0x28, 0x31, - 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x75, 0x28, 0x29, 0x2c, 0x75, 0x28, - 0x6c, 0x29, 0x29, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, - 0x66, 0x3d, 0x30, 0x3b, 0x66, 0x3c, 0x74, 0x5b, 0x6c, 0x5d, 0x2e, 0x6c, - 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x66, 0x2b, 0x2b, 0x29, 0x6e, 0x3d, - 0x74, 0x5b, 0x6c, 0x5d, 0x5b, 0x66, 0x5d, 0x2c, 0x31, 0x3d, 0x3d, 0x3d, - 0x69, 0x3f, 0x22, 0x3c, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x75, - 0x28, 0x29, 0x2c, 0x72, 0x3d, 0x5b, 0x72, 0x5d, 0x2c, 0x69, 0x3d, 0x33, - 0x29, 0x3a, 0x5f, 0x2b, 0x3d, 0x6e, 0x3a, 0x34, 0x3d, 0x3d, 0x3d, 0x69, - 0x3f, 0x22, 0x2d, 0x2d, 0x22, 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x22, - 0x3e, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x69, 0x3d, 0x31, 0x2c, - 0x5f, 0x3d, 0x22, 0x22, 0x29, 0x3a, 0x5f, 0x3d, 0x6e, 0x2b, 0x5f, 0x5b, - 0x30, 0x5d, 0x3a, 0x6f, 0x3f, 0x6e, 0x3d, 0x3d, 0x3d, 0x6f, 0x3f, 0x6f, - 0x3d, 0x22, 0x22, 0x3a, 0x5f, 0x2b, 0x3d, 0x6e, 0x3a, 0x27, 0x22, 0x27, - 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x27, 0x22, 0x3d, 0x3d, 0x3d, - 0x6e, 0x3f, 0x6f, 0x3d, 0x6e, 0x3a, 0x22, 0x3e, 0x22, 0x3d, 0x3d, 0x3d, - 0x6e, 0x3f, 0x28, 0x75, 0x28, 0x29, 0x2c, 0x69, 0x3d, 0x31, 0x29, 0x3a, - 0x69, 0x26, 0x26, 0x28, 0x22, 0x3d, 0x22, 0x3d, 
0x3d, 0x3d, 0x6e, 0x3f, - 0x28, 0x69, 0x3d, 0x35, 0x2c, 0x65, 0x3d, 0x5f, 0x2c, 0x5f, 0x3d, 0x22, - 0x22, 0x29, 0x3a, 0x22, 0x2f, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x26, 0x26, - 0x28, 0x69, 0x3c, 0x35, 0x7c, 0x7c, 0x22, 0x3e, 0x22, 0x3d, 0x3d, 0x3d, - 0x74, 0x5b, 0x6c, 0x5d, 0x5b, 0x66, 0x2b, 0x31, 0x5d, 0x29, 0x3f, 0x28, - 0x75, 0x28, 0x29, 0x2c, 0x33, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x28, - 0x72, 0x3d, 0x72, 0x5b, 0x30, 0x5d, 0x29, 0x2c, 0x69, 0x3d, 0x72, 0x2c, - 0x28, 0x72, 0x3d, 0x72, 0x5b, 0x30, 0x5d, 0x29, 0x2e, 0x70, 0x75, 0x73, - 0x68, 0x28, 0x32, 0x2c, 0x30, 0x2c, 0x69, 0x29, 0x2c, 0x69, 0x3d, 0x30, - 0x29, 0x3a, 0x22, 0x20, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, - 0x5c, 0x74, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x5c, 0x6e, - 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x5c, 0x72, 0x22, 0x3d, - 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x75, 0x28, 0x29, 0x2c, 0x69, 0x3d, 0x32, - 0x29, 0x3a, 0x5f, 0x2b, 0x3d, 0x6e, 0x29, 0x2c, 0x33, 0x3d, 0x3d, 0x3d, - 0x69, 0x26, 0x26, 0x22, 0x21, 0x2d, 0x2d, 0x22, 0x3d, 0x3d, 0x3d, 0x5f, - 0x26, 0x26, 0x28, 0x69, 0x3d, 0x34, 0x2c, 0x72, 0x3d, 0x72, 0x5b, 0x30, - 0x5d, 0x29, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x75, 0x28, - 0x29, 0x2c, 0x72, 0x7d, 0x28, 0x74, 0x29, 0x29, 0x2c, 0x6e, 0x29, 0x2c, - 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2c, 0x5b, 0x5d, - 0x29, 0x29, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x31, 0x3f, - 0x6e, 0x3a, 0x6e, 0x5b, 0x30, 0x5d, 0x7d, 0x76, 0x61, 0x72, 0x20, 0x6e, - 0x6e, 0x3d, 0x74, 0x6e, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x46, 0x29, - 0x3b, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x7b, 0x4c, 0x20, 0x61, 0x73, - 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x2c, 0x4f, - 0x20, 0x61, 0x73, 0x20, 0x46, 0x72, 0x61, 0x67, 0x6d, 0x65, 0x6e, 0x74, - 0x2c, 0x66, 0x20, 0x61, 0x73, 0x20, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, - 0x2c, 0x65, 0x20, 0x61, 0x73, 0x20, 0x62, 0x61, 0x74, 0x63, 0x68, 0x2c, - 0x66, 0x74, 0x20, 0x61, 0x73, 0x20, 0x63, 0x6c, 0x6f, 0x6e, 0x65, 0x45, - 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x64, 0x20, 0x61, 0x73, 0x20, - 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x2c, 0x73, 0x74, 0x20, - 0x61, 0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x43, 0x6f, 0x6e, - 0x74, 0x65, 0x78, 0x74, 0x2c, 0x46, 0x20, 0x61, 0x73, 0x20, 0x63, 0x72, - 0x65, 0x61, 0x74, 0x65, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, - 0x57, 0x20, 0x61, 0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x52, - 0x65, 0x66, 0x2c, 0x62, 0x20, 0x61, 0x73, 0x20, 0x65, 0x66, 0x66, 0x65, - 0x63, 0x74, 0x2c, 0x46, 0x20, 0x61, 0x73, 0x20, 0x68, 0x2c, 0x6e, 0x6e, - 0x20, 0x61, 0x73, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x2c, 0x6c, 0x74, 0x20, - 0x61, 0x73, 0x20, 0x68, 0x79, 0x64, 0x72, 0x61, 0x74, 0x65, 0x2c, 0x77, - 0x20, 0x61, 0x73, 0x20, 0x69, 0x73, 0x56, 0x61, 0x6c, 0x69, 0x64, 0x45, - 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x53, 0x20, 0x61, 0x73, 0x20, - 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x2c, 0x75, 0x74, 0x20, 0x61, - 0x73, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x2c, 0x73, 0x20, 0x61, - 0x73, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x7a, 0x20, 0x61, - 0x73, 0x20, 0x74, 0x6f, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x41, 0x72, 0x72, - 0x61, 0x79, 0x2c, 0x44, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, - 0x43, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x2c, 0x51, 0x74, 0x20, - 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x75, 0x74, - 0x65, 0x64, 0x2c, 0x24, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, - 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x2c, 
0x54, 0x74, 0x20, 0x61, - 0x73, 0x20, 0x75, 0x73, 0x65, 0x44, 0x65, 0x62, 0x75, 0x67, 0x56, 0x61, - 0x6c, 0x75, 0x65, 0x2c, 0x45, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, - 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x56, 0x74, 0x20, 0x61, - 0x73, 0x20, 0x75, 0x73, 0x65, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x42, 0x6f, - 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x2c, 0x41, 0x74, 0x20, 0x61, 0x73, - 0x20, 0x75, 0x73, 0x65, 0x49, 0x64, 0x2c, 0x4e, 0x74, 0x20, 0x61, 0x73, - 0x20, 0x75, 0x73, 0x65, 0x49, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x69, - 0x76, 0x65, 0x48, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x55, 0x74, 0x20, - 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x4c, 0x61, 0x79, 0x6f, 0x75, 0x74, - 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x50, 0x74, 0x20, 0x61, 0x73, - 0x20, 0x75, 0x73, 0x65, 0x4d, 0x65, 0x6d, 0x6f, 0x2c, 0x43, 0x74, 0x20, - 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x64, 0x75, 0x63, 0x65, - 0x72, 0x2c, 0x48, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x52, - 0x65, 0x66, 0x2c, 0x4b, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, - 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x58, 0x74, 0x20, 0x61, 0x73, - 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x45, 0x66, - 0x66, 0x65, 0x63, 0x74, 0x2c, 0x77, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, - 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x7d, 0x3b, 0x0a + 0x5a, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x6e, 0x3d, 0x50, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x6e, 0x2e, 0x63, 0x75, + 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x74, 0x3b, 0x48, 0x74, 0x28, 0x28, + 0x29, 0x3d, 0x3e, 0x53, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x6e, 0x2e, 0x63, + 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x28, 0x29, 0x29, 0x2c, 0x5b, 0x5d, + 0x29, 0x7d, 0x76, 0x61, 0x72, 0x20, 0x74, 0x6e, 0x3d, 0x66, 0x75, 0x6e, + 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, + 0x69, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x3b, 0x6e, 0x5b, 0x30, + 0x5d, 0x3d, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, + 0x6f, 0x3d, 0x31, 0x3b, 0x6f, 0x3c, 0x6e, 0x2e, 0x6c, 0x65, 0x6e, 0x67, + 0x74, 0x68, 0x3b, 0x6f, 0x2b, 0x2b, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, + 0x72, 0x3d, 0x6e, 0x5b, 0x6f, 0x2b, 0x2b, 0x5d, 0x2c, 0x75, 0x3d, 0x6e, + 0x5b, 0x6f, 0x5d, 0x3f, 0x28, 0x6e, 0x5b, 0x30, 0x5d, 0x7c, 0x3d, 0x72, + 0x3f, 0x31, 0x3a, 0x32, 0x2c, 0x65, 0x5b, 0x6e, 0x5b, 0x6f, 0x2b, 0x2b, + 0x5d, 0x5d, 0x29, 0x3a, 0x6e, 0x5b, 0x2b, 0x2b, 0x6f, 0x5d, 0x3b, 0x33, + 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x69, 0x5b, 0x30, 0x5d, 0x3d, 0x75, 0x3a, + 0x34, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x69, 0x5b, 0x31, 0x5d, 0x3d, 0x4f, + 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x61, 0x73, 0x73, 0x69, 0x67, 0x6e, + 0x28, 0x69, 0x5b, 0x31, 0x5d, 0x7c, 0x7c, 0x7b, 0x7d, 0x2c, 0x75, 0x29, + 0x3a, 0x35, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x28, 0x69, 0x5b, 0x31, 0x5d, + 0x3d, 0x69, 0x5b, 0x31, 0x5d, 0x7c, 0x7c, 0x7b, 0x7d, 0x29, 0x5b, 0x6e, + 0x5b, 0x2b, 0x2b, 0x6f, 0x5d, 0x5d, 0x3d, 0x75, 0x3a, 0x36, 0x3d, 0x3d, + 0x3d, 0x72, 0x3f, 0x69, 0x5b, 0x31, 0x5d, 0x5b, 0x6e, 0x5b, 0x2b, 0x2b, + 0x6f, 0x5d, 0x5d, 0x2b, 0x3d, 0x75, 0x2b, 0x22, 0x22, 0x3a, 0x72, 0x3f, + 0x28, 0x5f, 0x3d, 0x74, 0x2e, 0x61, 0x70, 0x70, 0x6c, 0x79, 0x28, 0x75, + 0x2c, 0x74, 0x6e, 0x28, 0x74, 0x2c, 0x75, 0x2c, 0x65, 0x2c, 0x5b, 0x22, + 0x22, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x5d, 0x29, 0x29, 0x2c, 0x69, 0x2e, + 0x70, 0x75, 0x73, 0x68, 0x28, 0x5f, 0x29, 0x2c, 0x75, 0x5b, 0x30, 0x5d, + 0x3f, 0x6e, 0x5b, 0x30, 0x5d, 0x7c, 0x3d, 0x32, 0x3a, 0x28, 0x6e, 0x5b, + 0x6f, 0x2d, 0x32, 0x5d, 0x3d, 0x30, 0x2c, 0x6e, 0x5b, 0x6f, 
0x5d, 0x3d, + 0x5f, 0x29, 0x29, 0x3a, 0x69, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x75, + 0x29, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x69, 0x7d, 0x2c, + 0x6e, 0x6e, 0x3d, 0x6e, 0x65, 0x77, 0x20, 0x4d, 0x61, 0x70, 0x3b, 0x66, + 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x65, 0x6e, 0x28, 0x74, + 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x6e, 0x6e, 0x2e, 0x67, + 0x65, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7c, 0x7c, 0x28, 0x6e, 0x3d, 0x6e, 0x65, + 0x77, 0x20, 0x4d, 0x61, 0x70, 0x2c, 0x6e, 0x6e, 0x2e, 0x73, 0x65, 0x74, + 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x6e, 0x29, 0x29, 0x2c, 0x28, 0x6e, + 0x3d, 0x74, 0x6e, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x6e, 0x2e, 0x67, + 0x65, 0x74, 0x28, 0x74, 0x29, 0x7c, 0x7c, 0x28, 0x6e, 0x2e, 0x73, 0x65, + 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, + 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, + 0x72, 0x20, 0x6e, 0x2c, 0x65, 0x2c, 0x69, 0x3d, 0x31, 0x2c, 0x5f, 0x3d, + 0x22, 0x22, 0x2c, 0x6f, 0x3d, 0x22, 0x22, 0x2c, 0x72, 0x3d, 0x5b, 0x30, + 0x5d, 0x2c, 0x75, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, + 0x28, 0x74, 0x29, 0x7b, 0x31, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x28, + 0x74, 0x7c, 0x7c, 0x28, 0x5f, 0x3d, 0x5f, 0x2e, 0x72, 0x65, 0x70, 0x6c, + 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x5c, 0x73, 0x2a, 0x5c, 0x6e, 0x5c, + 0x73, 0x2a, 0x7c, 0x5c, 0x73, 0x2a, 0x5c, 0x6e, 0x5c, 0x73, 0x2a, 0x24, + 0x2f, 0x67, 0x2c, 0x22, 0x22, 0x29, 0x29, 0x29, 0x3f, 0x72, 0x2e, 0x70, + 0x75, 0x73, 0x68, 0x28, 0x30, 0x2c, 0x74, 0x2c, 0x5f, 0x29, 0x3a, 0x33, + 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x28, 0x74, 0x7c, 0x7c, 0x5f, 0x29, + 0x3f, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x33, 0x2c, 0x74, + 0x2c, 0x5f, 0x29, 0x2c, 0x69, 0x3d, 0x32, 0x29, 0x3a, 0x32, 0x3d, 0x3d, + 0x3d, 0x69, 0x26, 0x26, 0x22, 0x2e, 0x2e, 0x2e, 0x22, 0x3d, 0x3d, 0x3d, + 0x5f, 0x26, 0x26, 0x74, 0x3f, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, + 0x34, 0x2c, 0x74, 0x2c, 0x30, 0x29, 0x3a, 0x32, 0x3d, 0x3d, 0x3d, 0x69, + 0x26, 0x26, 0x5f, 0x26, 0x26, 0x21, 0x74, 0x3f, 0x72, 0x2e, 0x70, 0x75, + 0x73, 0x68, 0x28, 0x35, 0x2c, 0x30, 0x2c, 0x21, 0x30, 0x2c, 0x5f, 0x29, + 0x3a, 0x69, 0x3e, 0x3d, 0x35, 0x26, 0x26, 0x28, 0x28, 0x5f, 0x7c, 0x7c, + 0x21, 0x74, 0x26, 0x26, 0x35, 0x3d, 0x3d, 0x3d, 0x69, 0x29, 0x26, 0x26, + 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x69, 0x2c, 0x30, 0x2c, + 0x5f, 0x2c, 0x65, 0x29, 0x2c, 0x69, 0x3d, 0x36, 0x29, 0x2c, 0x74, 0x26, + 0x26, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x69, 0x2c, 0x74, + 0x2c, 0x30, 0x2c, 0x65, 0x29, 0x2c, 0x69, 0x3d, 0x36, 0x29, 0x29, 0x2c, + 0x5f, 0x3d, 0x22, 0x22, 0x7d, 0x2c, 0x66, 0x3d, 0x30, 0x3b, 0x66, 0x3c, + 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x66, 0x2b, 0x2b, + 0x29, 0x7b, 0x66, 0x26, 0x26, 0x28, 0x31, 0x3d, 0x3d, 0x3d, 0x69, 0x26, + 0x26, 0x75, 0x28, 0x29, 0x2c, 0x75, 0x28, 0x66, 0x29, 0x29, 0x3b, 0x66, + 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x6c, 0x3d, 0x30, 0x3b, 0x6c, + 0x3c, 0x74, 0x5b, 0x66, 0x5d, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, + 0x3b, 0x6c, 0x2b, 0x2b, 0x29, 0x6e, 0x3d, 0x74, 0x5b, 0x66, 0x5d, 0x5b, + 0x6c, 0x5d, 0x2c, 0x31, 0x3d, 0x3d, 0x3d, 0x69, 0x3f, 0x22, 0x3c, 0x22, + 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x75, 0x28, 0x29, 0x2c, 0x72, 0x3d, + 0x5b, 0x72, 0x5d, 0x2c, 0x69, 0x3d, 0x33, 0x29, 0x3a, 0x5f, 0x2b, 0x3d, + 0x6e, 0x3a, 0x34, 0x3d, 0x3d, 0x3d, 0x69, 0x3f, 0x22, 0x2d, 0x2d, 0x22, + 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x22, 0x3e, 0x22, 0x3d, 
0x3d, 0x3d, + 0x6e, 0x3f, 0x28, 0x69, 0x3d, 0x31, 0x2c, 0x5f, 0x3d, 0x22, 0x22, 0x29, + 0x3a, 0x5f, 0x3d, 0x6e, 0x2b, 0x5f, 0x5b, 0x30, 0x5d, 0x3a, 0x6f, 0x3f, + 0x6e, 0x3d, 0x3d, 0x3d, 0x6f, 0x3f, 0x6f, 0x3d, 0x22, 0x22, 0x3a, 0x5f, + 0x2b, 0x3d, 0x6e, 0x3a, 0x27, 0x22, 0x27, 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, + 0x7c, 0x22, 0x27, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x6f, 0x3d, 0x6e, + 0x3a, 0x22, 0x3e, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x75, 0x28, + 0x29, 0x2c, 0x69, 0x3d, 0x31, 0x29, 0x3a, 0x69, 0x26, 0x26, 0x28, 0x22, + 0x3d, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x69, 0x3d, 0x35, 0x2c, + 0x65, 0x3d, 0x5f, 0x2c, 0x5f, 0x3d, 0x22, 0x22, 0x29, 0x3a, 0x22, 0x2f, + 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x28, 0x69, 0x3c, 0x35, 0x7c, + 0x7c, 0x22, 0x3e, 0x22, 0x3d, 0x3d, 0x3d, 0x74, 0x5b, 0x66, 0x5d, 0x5b, + 0x6c, 0x2b, 0x31, 0x5d, 0x29, 0x3f, 0x28, 0x75, 0x28, 0x29, 0x2c, 0x33, + 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x28, 0x72, 0x3d, 0x72, 0x5b, 0x30, + 0x5d, 0x29, 0x2c, 0x69, 0x3d, 0x72, 0x2c, 0x28, 0x72, 0x3d, 0x72, 0x5b, + 0x30, 0x5d, 0x29, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x32, 0x2c, 0x30, + 0x2c, 0x69, 0x29, 0x2c, 0x69, 0x3d, 0x30, 0x29, 0x3a, 0x22, 0x20, 0x22, + 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x5c, 0x74, 0x22, 0x3d, 0x3d, + 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x5c, 0x6e, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, + 0x7c, 0x7c, 0x22, 0x5c, 0x72, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, + 0x75, 0x28, 0x29, 0x2c, 0x69, 0x3d, 0x32, 0x29, 0x3a, 0x5f, 0x2b, 0x3d, + 0x6e, 0x29, 0x2c, 0x33, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x22, 0x21, + 0x2d, 0x2d, 0x22, 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x28, 0x69, 0x3d, + 0x34, 0x2c, 0x72, 0x3d, 0x72, 0x5b, 0x30, 0x5d, 0x29, 0x7d, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x75, 0x28, 0x29, 0x2c, 0x72, 0x7d, 0x28, + 0x74, 0x29, 0x29, 0x2c, 0x6e, 0x29, 0x2c, 0x61, 0x72, 0x67, 0x75, 0x6d, + 0x65, 0x6e, 0x74, 0x73, 0x2c, 0x5b, 0x5d, 0x29, 0x29, 0x2e, 0x6c, 0x65, + 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x31, 0x3f, 0x6e, 0x3a, 0x6e, 0x5b, 0x30, + 0x5d, 0x7d, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x6e, 0x3d, 0x65, 0x6e, 0x2e, + 0x62, 0x69, 0x6e, 0x64, 0x28, 0x57, 0x29, 0x3b, 0x65, 0x78, 0x70, 0x6f, + 0x72, 0x74, 0x7b, 0x49, 0x20, 0x61, 0x73, 0x20, 0x43, 0x6f, 0x6d, 0x70, + 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x2c, 0x52, 0x20, 0x61, 0x73, 0x20, 0x46, + 0x72, 0x61, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x63, 0x20, 0x61, 0x73, + 0x20, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x65, 0x20, 0x61, 0x73, + 0x20, 0x62, 0x61, 0x74, 0x63, 0x68, 0x2c, 0x63, 0x74, 0x20, 0x61, 0x73, + 0x20, 0x63, 0x6c, 0x6f, 0x6e, 0x65, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, + 0x74, 0x2c, 0x79, 0x20, 0x61, 0x73, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, + 0x74, 0x65, 0x64, 0x2c, 0x68, 0x74, 0x20, 0x61, 0x73, 0x20, 0x63, 0x72, + 0x65, 0x61, 0x74, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x2c, + 0x57, 0x20, 0x61, 0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x45, + 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x4c, 0x20, 0x61, 0x73, 0x20, + 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x52, 0x65, 0x66, 0x2c, 0x53, 0x20, + 0x61, 0x73, 0x20, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x57, 0x20, + 0x61, 0x73, 0x20, 0x68, 0x2c, 0x5f, 0x6e, 0x20, 0x61, 0x73, 0x20, 0x68, + 0x74, 0x6d, 0x6c, 0x2c, 0x73, 0x74, 0x20, 0x61, 0x73, 0x20, 0x68, 0x79, + 0x64, 0x72, 0x61, 0x74, 0x65, 0x2c, 0x45, 0x20, 0x61, 0x73, 0x20, 0x69, + 0x73, 0x56, 0x61, 0x6c, 0x69, 0x64, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, + 0x74, 0x2c, 0x77, 0x20, 0x61, 0x73, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, + 0x6e, 0x73, 0x2c, 0x6c, 0x74, 0x20, 0x61, 0x73, 0x20, 0x72, 
0x65, 0x6e, + 0x64, 0x65, 0x72, 0x2c, 0x68, 0x20, 0x61, 0x73, 0x20, 0x73, 0x69, 0x67, + 0x6e, 0x61, 0x6c, 0x2c, 0x4b, 0x20, 0x61, 0x73, 0x20, 0x74, 0x6f, 0x43, + 0x68, 0x69, 0x6c, 0x64, 0x41, 0x72, 0x72, 0x61, 0x79, 0x2c, 0x72, 0x20, + 0x61, 0x73, 0x20, 0x75, 0x6e, 0x74, 0x72, 0x61, 0x63, 0x6b, 0x65, 0x64, + 0x2c, 0x54, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x43, 0x61, + 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x2c, 0x59, 0x74, 0x20, 0x61, 0x73, + 0x20, 0x75, 0x73, 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, + 0x2c, 0x56, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x43, 0x6f, + 0x6e, 0x74, 0x65, 0x78, 0x74, 0x2c, 0x41, 0x74, 0x20, 0x61, 0x73, 0x20, + 0x75, 0x73, 0x65, 0x44, 0x65, 0x62, 0x75, 0x67, 0x56, 0x61, 0x6c, 0x75, + 0x65, 0x2c, 0x48, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x45, + 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x46, 0x74, 0x20, 0x61, 0x73, 0x20, + 0x75, 0x73, 0x65, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x42, 0x6f, 0x75, 0x6e, + 0x64, 0x61, 0x72, 0x79, 0x2c, 0x4d, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, + 0x73, 0x65, 0x49, 0x64, 0x2c, 0x24, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, + 0x73, 0x65, 0x49, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x69, 0x76, 0x65, + 0x48, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x4e, 0x74, 0x20, 0x61, 0x73, + 0x20, 0x75, 0x73, 0x65, 0x4c, 0x61, 0x79, 0x6f, 0x75, 0x74, 0x45, 0x66, + 0x66, 0x65, 0x63, 0x74, 0x2c, 0x44, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, + 0x73, 0x65, 0x4d, 0x65, 0x6d, 0x6f, 0x2c, 0x55, 0x74, 0x20, 0x61, 0x73, + 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x64, 0x75, 0x63, 0x65, 0x72, 0x2c, + 0x50, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, + 0x2c, 0x58, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, + 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x5a, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, + 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x45, 0x66, 0x66, 0x65, + 0x63, 0x74, 0x2c, 0x45, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, + 0x53, 0x74, 0x61, 0x74, 0x65, 0x7d, 0x3b, 0x0a }; -unsigned int index_js_len = 22174; +unsigned int index_js_len = 22472; diff --git a/examples/server/json-schema-to-grammar.mjs.hpp b/examples/server/json-schema-to-grammar.mjs.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0a05c369d77ba58cbd7f85324efe312df5f51916 --- /dev/null +++ b/examples/server/json-schema-to-grammar.mjs.hpp @@ -0,0 +1,311 @@ +unsigned char json_schema_to_grammar_mjs[] = { + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f, + 0x52, 0x55, 0x4c, 0x45, 0x20, 0x3d, 0x20, 0x27, 0x22, 0x20, 0x22, 0x3f, + 0x27, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x52, + 0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, + 0x53, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x62, 0x6f, 0x6f, 0x6c, + 0x65, 0x61, 0x6e, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x74, 0x72, 0x75, 0x65, + 0x22, 0x20, 0x7c, 0x20, 0x22, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x22, 0x29, + 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x6e, + 0x75, 0x6d, 0x62, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d, 0x22, + 0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20, 0x5b, + 0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a, 0x29, + 0x29, 0x20, 0x28, 0x22, 0x2e, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, + 0x2b, 0x29, 0x3f, 0x20, 0x28, 0x5b, 0x65, 0x45, 0x5d, 0x20, 0x5b, 0x2d, + 0x2b, 0x5d, 0x3f, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2b, 0x29, 0x3f, + 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x69, + 0x6e, 0x74, 0x65, 0x67, 0x65, 
0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d, + 0x22, 0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20, + 0x5b, 0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a, + 0x29, 0x29, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, + 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x60, 0x20, 0x22, + 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x5b, 0x5e, 0x22, 0x5c, 0x5c, 0x5c, 0x5c, 0x5d, 0x20, + 0x7c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x22, 0x5c, + 0x5c, 0x5c, 0x5c, 0x22, 0x20, 0x28, 0x5b, 0x22, 0x5c, 0x5c, 0x5c, 0x5c, + 0x2f, 0x62, 0x66, 0x6e, 0x72, 0x74, 0x5d, 0x20, 0x7c, 0x20, 0x22, 0x75, + 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46, + 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46, + 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46, + 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46, + 0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2a, 0x20, + 0x22, 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60, + 0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3a, 0x20, 0x27, 0x22, + 0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, + 0x2c, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44, 0x5f, 0x52, 0x55, 0x4c, 0x45, + 0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f, 0x52, 0x45, 0x20, 0x3d, 0x20, + 0x2f, 0x5b, 0x5e, 0x5c, 0x64, 0x41, 0x2d, 0x5a, 0x61, 0x2d, 0x7a, 0x2d, + 0x5d, 0x2b, 0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45, + 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x52, + 0x45, 0x20, 0x3d, 0x20, 0x2f, 0x5b, 0x5c, 0x6e, 0x5c, 0x72, 0x22, 0x5d, + 0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x47, 0x52, + 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41, + 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x53, 0x20, 0x3d, 0x20, + 0x7b, 0x27, 0x5c, 0x72, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x72, 0x27, + 0x2c, 0x20, 0x27, 0x5c, 0x6e, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x6e, + 0x27, 0x2c, 0x20, 0x27, 0x22, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x22, + 0x27, 0x7d, 0x3b, 0x0a, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, + 0x63, 0x6c, 0x61, 0x73, 0x73, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, + 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, + 0x72, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x5f, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x3d, + 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x7c, + 0x7c, 0x20, 0x7b, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x20, 0x3d, 0x20, + 0x6e, 0x65, 0x77, 0x20, 0x4d, 0x61, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, + 0x65, 0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x27, 0x73, 0x70, 0x61, 0x63, + 0x65, 0x27, 0x2c, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f, 0x52, 0x55, + 0x4c, 0x45, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72, + 0x61, 0x6c, 0x28, 0x6c, 0x69, 
0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x4a, 0x53, + 0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, + 0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x2e, 0x72, 0x65, + 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, + 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, + 0x52, 0x45, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x20, + 0x3d, 0x3e, 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, + 0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, + 0x45, 0x53, 0x5b, 0x6d, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x60, 0x22, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x7d, + 0x22, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f, + 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x6e, 0x61, 0x6d, 0x65, + 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, + 0x65, 0x20, 0x3d, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2e, 0x72, 0x65, 0x70, + 0x6c, 0x61, 0x63, 0x65, 0x28, 0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44, + 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f, + 0x52, 0x45, 0x2c, 0x20, 0x27, 0x2d, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20, + 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, + 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28, 0x65, 0x73, + 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x67, 0x65, 0x74, 0x28, + 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x3d, 0x3d, 0x3d, + 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, + 0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, + 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28, + 0x60, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24, + 0x7b, 0x69, 0x7d, 0x60, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x60, 0x24, 0x7b, + 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x69, 0x7d, + 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, + 0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x6b, 0x65, 0x79, 0x2c, 0x20, 0x72, + 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20, + 0x7d, 0x0a, 0x0a, 0x20, 0x20, 
0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73, + 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, + 0x3d, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x74, 0x79, 0x70, + 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, + 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x7c, 0x7c, 0x20, 0x27, 0x72, 0x6f, 0x6f, + 0x74, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, + 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x6f, 0x6e, 0x65, 0x4f, + 0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, + 0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, + 0x65, 0x20, 0x3d, 0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, + 0x6f, 0x6e, 0x65, 0x4f, 0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, + 0x65, 0x6d, 0x61, 0x2e, 0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x2e, 0x6d, + 0x61, 0x70, 0x28, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d, + 0x61, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69, + 0x73, 0x69, 0x74, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d, + 0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24, + 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20, + 0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x69, 0x7d, 0x60, 0x29, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, + 0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, + 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, + 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x72, + 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, + 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, + 0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, + 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72, + 0x61, 0x6c, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x65, + 0x6e, 0x75, 0x6d, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, + 0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d, + 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x65, 0x6e, 0x75, 0x6d, + 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x76, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x68, + 0x69, 0x73, 0x2e, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, + 0x74, 0x65, 0x72, 0x61, 0x6c, 0x28, 0x76, 0x29, 0x29, 0x2e, 0x6a, 0x6f, + 0x69, 0x6e, 0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x74, 0x68, 0x69, 0x73, 0x2e, 
0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, + 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, + 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, + 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x63, + 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, + 0x20, 0x27, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x27, 0x20, 0x26, 0x26, + 0x20, 0x27, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, + 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, + 0x54, 0x4f, 0x44, 0x4f, 0x3a, 0x20, 0x60, 0x72, 0x65, 0x71, 0x75, 0x69, + 0x72, 0x65, 0x64, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64, + 0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f, + 0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, + 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, + 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, + 0x72, 0x6f, 0x70, 0x50, 0x61, 0x69, 0x72, 0x73, 0x20, 0x3d, 0x20, 0x4f, + 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x65, 0x6e, 0x74, 0x72, 0x69, 0x65, + 0x73, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x70, 0x72, 0x6f, + 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x29, 0x2e, 0x73, 0x6f, 0x72, + 0x74, 0x28, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, + 0x20, 0x73, 0x6f, 0x72, 0x74, 0x20, 0x62, 0x79, 0x20, 0x70, 0x6f, 0x73, + 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f, + 0x70, 0x5f, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x28, 0x69, 0x66, 0x20, + 0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x65, 0x64, 0x29, 0x20, 0x74, + 0x68, 0x65, 0x6e, 0x20, 0x62, 0x79, 0x20, 0x6b, 0x65, 0x79, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x3d, 0x20, 0x74, 0x79, + 0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, + 0x65, 0x72, 0x5b, 0x61, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3d, 0x3d, 0x3d, + 0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x27, 0x20, 0x3f, 0x20, + 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x61, 0x5b, + 0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49, 0x6e, 0x66, 0x69, 0x6e, 0x69, + 0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x42, + 0x20, 0x3d, 0x20, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72, + 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d, + 0x5d, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x27, 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, + 0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49, + 0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, + 0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x2d, 0x20, 0x6f, 0x72, 0x64, + 0x65, 0x72, 0x42, 0x20, 0x7c, 0x7c, 0x20, 0x61, 0x5b, 0x30, 0x5d, 0x2e, + 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x61, 0x72, + 0x65, 0x28, 0x62, 0x5b, 0x30, 
0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d, + 0x20, 0x27, 0x22, 0x7b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x70, + 0x50, 0x61, 0x69, 0x72, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, + 0x68, 0x28, 0x28, 0x5b, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, + 0x2c, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, + 0x5d, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x70, 0x72, 0x6f, 0x70, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, + 0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69, 0x73, + 0x69, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, + 0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24, + 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20, + 0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x4e, + 0x61, 0x6d, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3e, 0x20, + 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x27, + 0x20, 0x22, 0x2c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, + 0x2b, 0x3d, 0x20, 0x60, 0x20, 0x24, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, + 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72, + 0x61, 0x6c, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x29, + 0x7d, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x22, 0x3a, 0x22, 0x20, + 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, + 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x60, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20, + 0x27, 0x20, 0x22, 0x7d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, + 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, + 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, + 0x6d, 0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, + 0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, + 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x61, 0x72, 0x72, 0x61, 0x79, 0x27, + 0x20, 0x26, 0x26, 0x20, 0x27, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x27, 0x20, + 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x20, 0x7b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x4f, + 0x44, 0x4f, 0x20, 0x60, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x49, 0x74, + 0x65, 0x6d, 0x73, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64, + 0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f, + 0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75, + 0x6c, 0x65, 0x4e, 0x61, 0x6d, 
0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, + 0x73, 0x2e, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65, + 0x6d, 0x61, 0x2e, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x2c, 0x20, 0x60, 0x24, + 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, + 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20, 0x3a, 0x20, 0x22, 0x22, 0x7d, + 0x69, 0x74, 0x65, 0x6d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, + 0x20, 0x3d, 0x20, 0x60, 0x22, 0x5b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, + 0x65, 0x20, 0x28, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75, 0x6c, + 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x28, 0x22, 0x2c, 0x22, 0x20, + 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d, + 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x29, 0x2a, 0x29, + 0x3f, 0x20, 0x22, 0x5d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, + 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, + 0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x50, 0x52, + 0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, + 0x53, 0x5b, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, + 0x5d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, + 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x55, 0x6e, 0x72, 0x65, 0x63, 0x6f, + 0x67, 0x6e, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, + 0x61, 0x3a, 0x20, 0x24, 0x7b, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74, + 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x73, 0x63, 0x68, 0x65, + 0x6d, 0x61, 0x29, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, + 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, + 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20, + 0x3f, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20, 0x3a, 0x20, 0x73, + 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x2c, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x50, 0x52, 0x49, 0x4d, 0x49, + 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x53, 0x5b, 0x73, + 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x5d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72, + 0x6d, 0x61, 0x74, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x28, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x67, + 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, + 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, + 0x28, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, + 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x2b, 0x3d, 0x20, + 0x60, 0x24, 0x7b, 0x6e, 0x61, 
0x6d, 0x65, 0x7d, 0x20, 0x3a, 0x3a, 0x3d, + 0x20, 0x24, 0x7b, 0x72, 0x75, 0x6c, 0x65, 0x7d, 0x5c, 0x6e, 0x60, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x72, 0x61, 0x6d, + 0x6d, 0x61, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a +}; +unsigned int json_schema_to_grammar_mjs_len = 3695; diff --git a/examples/server/public/index.html b/examples/server/public/index.html index de41da187a40f4413738e3cf7dc93f591b71546d..1bf2a8b3a0a0358f82eb19dad8f85e2139bf8f11 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -102,6 +102,17 @@ padding: 0.5em; } + .prob-set { + padding: 0.3em; + border-bottom: 1px solid #ccc; + } + + .popover-content { + position: absolute; + background-color: white; + padding: 0.2em; + box-shadow: 0 0 10px rgba(0, 0, 0, 0.1); + } textarea { padding: 5px; @@ -133,22 +144,51 @@ font-size: 80%; color: #888; } + + + @keyframes loading-bg-wipe { + 0% { + background-position: 0%; + } + 100% { + background-position: 100%; + } + } + + .loading { + --loading-color-1: #eeeeee00; + --loading-color-2: #eeeeeeff; + background-size: 50% 100%; + background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1)); + animation: loading-bg-wipe 2s linear infinite; + } + + @media (prefers-color-scheme: dark) { + .loading { + --loading-color-1: #22222200; + --loading-color-2: #222222ff; + } + .popover-content { + background-color: black; + } + } +
+
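For reference, the hex array in json-schema-to-grammar.mjs.hpp above is just the UTF-8 bytes of a small ES module exporting a SchemaConverter that lowers a JSON schema into GBNF grammar rules. A minimal usage sketch under stated assumptions — the relative import path and the sample schema are illustrative, not part of this patch:

// json-schema-to-grammar usage sketch (API decoded from the embedded hex above).
// Assumption: the module file sits next to this script.
import { SchemaConverter } from './json-schema-to-grammar.mjs';

// Optional prop-order map: lower numbers sort first; keys not listed sort last.
const converter = new SchemaConverter({ name: 0, age: 1 });

// Visiting with an empty name registers the top-level rule as "root".
converter.visit({
  type: 'object',
  properties: {
    name: { type: 'string' },
    age:  { type: 'integer' },
  },
}, '');

// formatGrammar() emits one "<name> ::= <rule>" line per registered rule:
// the built-in "space" rule, the "root" object rule, and the primitive
// "string"/"integer" rules it referenced.
console.log(converter.formatGrammar());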
diff --git a/examples/server/public/index.js b/examples/server/public/index.js index 4fa725a90fac42decad924a7b397091b3c511074..9db5a9d9faf299fe8f19467b6a3bd068e4f56677 100644 --- a/examples/server/public/index.js +++ b/examples/server/public/index.js @@ -1 +1 @@ -function t(){throw new Error("Cycle detected")}function n(){if(o>1){o--;return}let t,n=!1;while(void 0!==_){let i=_;_=void 0;r++;while(void 0!==i){const _=i.o;i.o=void 0;i.f&=-3;if(!(8&i.f)&&c(i))try{i.c()}catch(e){if(!n){t=e;n=!0}}i=_}}r=0;o--;if(n)throw t}function e(t){if(o>0)return t();o++;try{return t()}finally{n()}}let i,_,o=0,r=0,u=0;function l(t){if(void 0===i)return;let n=t.n;if(void 0===n||n.t!==i){n={i:0,S:t,p:i.s,n:void 0,t:i,e:void 0,x:void 0,r:n};if(void 0!==i.s)i.s.n=n;i.s=n;t.n=n;if(32&i.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=i.s;n.n=void 0;i.s.n=n;i.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){const n=this;return b((function(){const e=n.value,i=32&this.f;this.f&=-33;try{t(e)}finally{this.f|=i}}))};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){return this.v};Object.defineProperty(f.prototype,"value",{get(){const t=l(this);if(void 0!==t)t.i=this.i;return this.v},set(e){if(i instanceof p)!function(){throw new Error("Computed cannot have side-effects")}();if(e!==this.v){if(r>100)t();this.v=e;this.i++;u++;o++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function s(t){return new f(t)}function c(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function h(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function a(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function p(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=u-1;this.f=4}(p.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===u)return!0;this.g=u;this.f|=1;if(this.i>0&&!c(this)){this.f&=-2;return!0}const t=i;try{h(this);i=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}i=t;a(this);this.f&=-2;return!0};p.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};p.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};p.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};p.prototype.peek=function(){if(!this.h())t();if(16&this.f)throw this.v;return this.v};Object.defineProperty(p.prototype,"value",{get(){if(1&this.f)t();const n=l(this);this.h();if(void 0!==n)n.i=this.i;if(16&this.f)throw this.v;return this.v}});function d(t){return new p(t)}function v(t){const e=t.u;t.u=void 0;if("function"==typeof e){o++;const _=i;i=void 
0;try{e()}catch(n){t.f&=-2;t.f|=8;y(t);throw n}finally{i=_;n()}}}function y(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;v(t)}function m(t){if(i!==this)throw new Error("Out-of-order effect");a(this);i=t;this.f&=-2;if(8&this.f)y(this);n()}function g(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}g.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};g.prototype.S=function(){if(1&this.f)t();this.f|=1;this.f&=-9;v(this);h(this);o++;const n=i;i=this;return m.bind(this,n)};g.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=_;_=this}};g.prototype.d=function(){this.f|=8;if(!(1&this.f))y(this)};function b(t){const n=new g(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var k,S,x,w,C,E,U,H,N,P={},D=[],$=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,T=Array.isArray;function V(t,n){for(var e in n)t[e]=n[e];return t}function A(t){var n=t.parentNode;n&&n.removeChild(t)}function F(t,n,e){var i,_,o,r={};for(o in n)"key"==o?i=n[o]:"ref"==o?_=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?k.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return M(t,r,i,_,null)}function M(t,n,e,i,_){var o={type:t,props:n,key:e,ref:i,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,__h:null,constructor:void 0,__v:null==_?++x:_};return null==_&&null!=S.vnode&&S.vnode(o),o}function W(){return{current:null}}function O(t){return t.children}function L(t,n){this.props=t,this.context=n}function R(t,n){if(null==n)return t.__?R(t.__,t.__.__k.indexOf(t)+1):null;for(var e;nn&&C.sort(H));q.__r=0}function B(t,n,e,i,_,o,r,u,l,f){var s,c,h,a,p,d,v,y=i&&i.__k||D,m=y.length;for(e.__k=[],s=0;s0?M(a.type,a.props,a.key,a.ref?a.ref:null,a.__v):a)){if(a.__=e,a.__b=e.__b+1,null===(h=y[s])||h&&a.key==h.key&&a.type===h.type)y[s]=void 0;else for(c=0;c=0;n--)if((e=t.__k[n])&&(i=K(e)))return i;return null}function Q(t,n,e,i,_){var o;for(o in e)"children"===o||"key"===o||o in n||Y(t,o,null,e[o],i);for(o in n)_&&"function"!=typeof n[o]||"children"===o||"key"===o||"value"===o||"checked"===o||e[o]===n[o]||Y(t,o,n[o],e[o],i)}function X(t,n,e){"-"===n[0]?t.setProperty(n,null==e?"":e):t[n]=null==e?"":"number"!=typeof e||$.test(n)?e:e+"px"}function Y(t,n,e,i,_){var o;t:if("style"===n)if("string"==typeof e)t.style.cssText=e;else{if("string"==typeof i&&(t.style.cssText=i=""),i)for(n in i)e&&n in e||X(t.style,n,"");if(e)for(n in e)i&&e[n]===i[n]||X(t.style,n,e[n])}else if("o"===n[0]&&"n"===n[1])o=n!==(n=n.replace(/Capture$/,"")),n=n.toLowerCase()in t?n.toLowerCase().slice(2):n.slice(2),t.l||(t.l={}),t.l[n+o]=e,e?i||t.addEventListener(n,o?tt:Z,o):t.removeEventListener(n,o?tt:Z,o);else if("dangerouslySetInnerHTML"!==n){if(_)n=n.replace(/xlink(H|:h)/,"h").replace(/sName$/,"s");else if("width"!==n&&"height"!==n&&"href"!==n&&"list"!==n&&"form"!==n&&"tabIndex"!==n&&"download"!==n&&"rowSpan"!==n&&"colSpan"!==n&&n in t)try{t[n]=null==e?"":e;break t}catch(t){}"function"==typeof e||(null==e||!1===e&&"-"!==n[4]?t.removeAttribute(n):t.setAttribute(n,e))}}function Z(t){return this.l[t.type+!1](S.event?S.event(t):t)}function tt(t){return this.l[t.type+!0](S.event?S.event(t):t)}function nt(t,n,e,i,_,o,r,u,l){var f,s,c,h,a,p,d,v,y,m,g,b,k,x,w,C=n.type;if(void 0!==n.constructor)return null;null!=e.__h&&(l=e.__h,u=n.__e=e.__e,n.__h=null,o=[u]),(f=S.__b)&&f(n);try{t:if("function"==typeof 
C){if(v=n.props,y=(f=C.contextType)&&i[f.__c],m=f?y?y.props.value:f.__:i,e.__c?d=(s=n.__c=e.__c).__=s.__E:("prototype"in C&&C.prototype.render?n.__c=s=new C(v,m):(n.__c=s=new L(v,m),s.constructor=C,s.render=rt),y&&y.sub(s),s.props=v,s.state||(s.state={}),s.context=m,s.__n=i,c=s.__d=!0,s.__h=[],s._sb=[]),null==s.__s&&(s.__s=s.state),null!=C.getDerivedStateFromProps&&(s.__s==s.state&&(s.__s=V({},s.__s)),V(s.__s,C.getDerivedStateFromProps(v,s.__s))),h=s.props,a=s.state,s.__v=n,c)null==C.getDerivedStateFromProps&&null!=s.componentWillMount&&s.componentWillMount(),null!=s.componentDidMount&&s.__h.push(s.componentDidMount);else{if(null==C.getDerivedStateFromProps&&v!==h&&null!=s.componentWillReceiveProps&&s.componentWillReceiveProps(v,m),!s.__e&&null!=s.shouldComponentUpdate&&!1===s.shouldComponentUpdate(v,s.__s,m)||n.__v===e.__v){for(n.__v!==e.__v&&(s.props=v,s.state=s.__s,s.__d=!1),s.__e=!1,n.__e=e.__e,n.__k=e.__k,n.__k.forEach((function(t){t&&(t.__=n)})),g=0;g2&&(u.children=arguments.length>3?k.call(arguments,2):e),M(t.type,u,i||t.key,_||t.ref,null)}function st(t,n){var e={__c:n="__cC"+N++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,i;return this.getChildContext||(e=[],(i={})[n]=this,this.getChildContext=function(){return i},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,j(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}k=D.slice,S={__e:function(t,n,e,i){for(var _,o,r;n=n.__;)if((_=n.__c)&&!_.__)try{if((o=_.constructor)&&null!=o.getDerivedStateFromError&&(_.setState(o.getDerivedStateFromError(t)),r=_.__d),null!=_.componentDidCatch&&(_.componentDidCatch(t,i||{}),r=_.__d),r)return _.__E=_}catch(n){t=n}throw t}},x=0,w=function(t){return null!=t&&void 0===t.constructor},L.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=V({},this.state),"function"==typeof t&&(t=t(V({},e),this.props)),t&&V(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),j(this))},L.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),j(this))},L.prototype.render=O,C=[],U="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,H=function(t,n){return t.__v.__b-n.__v.__b},q.__r=0,N=0;var ct,ht,at,pt,dt=0,vt=[],yt=[],mt=S.__b,gt=S.__r,bt=S.diffed,kt=S.__c,St=S.unmount;function xt(t,n){S.__h&&S.__h(ht,t,dt||n),dt=0;var e=ht.__H||(ht.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({__V:yt}),e.__[t]}function wt(t){return dt=1,Ct(It,t)}function Ct(t,n,e){var i=xt(ct++,2);if(i.t=t,!i.__c&&(i.__=[e?e(n):It(void 0,n),function(t){var n=i.__N?i.__N[0]:i.__[0],e=i.t(n,t);n!==e&&(i.__N=[e,i.__[1]],i.__c.setState({}))}],i.__c=ht,!ht.u)){var _=function(t,n,e){if(!i.__c.__H)return!0;var _=i.__c.__H.__.filter((function(t){return t.__c}));if(_.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return _.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&i.__c.props===t)&&(!o||o.call(this,t,n,e))};ht.u=!0;var o=ht.shouldComponentUpdate,r=ht.componentWillUpdate;ht.componentWillUpdate=function(t,n,e){if(this.__e){var i=o;o=void 0,_(t,n,e),o=i}r&&r.call(this,t,n,e)},ht.shouldComponentUpdate=_}return i.__N||i.__}function Et(t,n){var e=xt(ct++,3);!S.__s&&Rt(e.__H,n)&&(e.__=t,e.i=n,ht.__H.__h.push(e))}function Ut(t,n){var 
e=xt(ct++,4);!S.__s&&Rt(e.__H,n)&&(e.__=t,e.i=n,ht.__h.push(e))}function Ht(t){return dt=5,Pt((function(){return{current:t}}),[])}function Nt(t,n,e){dt=6,Ut((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Pt(t,n){var e=xt(ct++,7);return Rt(e.__H,n)?(e.__V=t(),e.i=n,e.__h=t,e.__V):e.__}function Dt(t,n){return dt=8,Pt((function(){return t}),n)}function $t(t){var n=ht.context[t.__c],e=xt(ct++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(ht)),n.props.value):t.__}function Tt(t,n){S.useDebugValue&&S.useDebugValue(n?n(t):t)}function Vt(t){var n=xt(ct++,10),e=wt();return n.__=t,ht.componentDidCatch||(ht.componentDidCatch=function(t,i){n.__&&n.__(t,i),e[1](t)}),[e[0],function(){e[1](void 0)}]}function At(){var t=xt(ct++,11);if(!t.__){for(var n=ht.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ft(){for(var t;t=vt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Ot),t.__H.__h.forEach(Lt),t.__H.__h=[]}catch(u){t.__H.__h=[],S.__e(u,t.__v)}}S.__b=function(t){ht=null,mt&&mt(t)},S.__r=function(t){gt&&gt(t),ct=0;var n=(ht=t.__c).__H;n&&(at===ht?(n.__h=[],ht.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.__V=yt,t.__N=t.i=void 0}))):(n.__h.forEach(Ot),n.__h.forEach(Lt),n.__h=[],ct=0)),at=ht},S.diffed=function(t){bt&&bt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==vt.push(n)&&pt===S.requestAnimationFrame||((pt=S.requestAnimationFrame)||Wt)(Ft)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.__V!==yt&&(t.__=t.__V),t.i=void 0,t.__V=yt}))),at=ht=null},S.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Ot),t.__h=t.__h.filter((function(t){return!t.__||Lt(t)}))}catch(s){n.some((function(t){t.__h&&(t.__h=[])})),n=[],S.__e(s,t.__v)}})),kt&&kt(t,n)},S.unmount=function(t){St&&St(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Ot(t)}catch(t){n=t}})),e.__H=void 0,n&&S.__e(n,e.__v))};var Mt="function"==typeof requestAnimationFrame;function Wt(t){var n,e=function(){clearTimeout(i),Mt&&cancelAnimationFrame(n),setTimeout(t)},i=setTimeout(e,100);Mt&&(n=requestAnimationFrame(e))}function Ot(t){var n=ht,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),ht=n}function Lt(t){var n=ht;t.__c=t.__(),ht=n}function Rt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function It(t,n){return"function"==typeof n?n(t):n}function jt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let qt,Bt;function Gt(t){if(Bt)Bt();Bt=t&&t.S()}function zt({data:t}){const n=Kt(t);n.value=t;const e=Pt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{this.base.data=e.peek()};return d(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}zt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:zt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});jt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let i in e){if("children"===i)continue;let _=e[i];if(_ instanceof f){if(!t)n.__np=t={};t[i]=_;e[i]=_.peek()}}}t(n)});jt("__r",(t,n)=>{Gt();let e,i=n.__c;if(i){i.__$f&=-2;e=i.__$u;if(void 0===e)i.__$u=e=function(t){let n;b((function(){n=this}));n.c=()=>{i.__$f|=1;i.setState({})};return n}()}qt=i;Gt(e);t(n)});jt("__e",(t,n,e,i)=>{Gt();qt=void 0;t(n,e,i)});jt("diffed",(t,n)=>{Gt();qt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,i=n.props;if(t){let 
n=e.U;if(n)for(let e in n){let i=n[e];if(void 0!==i&&!(e in t)){i.d();n[e]=void 0}}else{n={};e.U=n}for(let _ in t){let o=n[_],r=t[_];if(void 0===o){o=Jt(e,_,r,i);n[_]=o}else o.o(r,i)}}}t(n)});function Jt(t,n,e,i){const _=n in t&&void 0===t.ownerSVGElement,o=s(e);return{o:(t,n)=>{o.value=t;i=n},d:b(()=>{const e=o.value.value;if(i[n]!==e){i[n]=e;if(_)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}jt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});jt("__h",(t,n,e,i)=>{if(i<3)n.__$f|=2;t(n,e,i)});L.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let i in n)return!0;for(let i in t)if("__source"!==i&&t[i]!==this.props[i])return!0;for(let i in this.props)if(!(i in t))return!0;return!1};function Kt(t){return Pt(()=>s(t),[])}function Qt(t){const n=Ht(t);n.current=t;qt.__$f|=4;return Pt(()=>d(()=>n.current()),[])}function Xt(t){const n=Ht(t);n.current=t;Et(()=>b(()=>n.current()),[])}var Yt=function(t,n,e,i){var _;n[0]=0;for(var o=1;o=5&&((_||!t&&5===i)&&(r.push(i,0,_,e),i=6),t&&(r.push(i,t,0,e),i=6)),_=""},l=0;l"===n?(i=1,_=""):_=n+_[0]:o?n===o?o="":_+=n:'"'===n||"'"===n?o=n:">"===n?(u(),i=1):i&&("="===n?(i=5,e=_,_=""):"/"===n&&(i<5||">"===t[l][f+1])?(u(),3===i&&(r=r[0]),i=r,(r=r[0]).push(2,0,i),i=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),i=2):_+=n),3===i&&"!--"===_&&(i=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var nn=tn.bind(F);export{L as Component,O as Fragment,f as Signal,e as batch,ft as cloneElement,d as computed,st as createContext,F as createElement,W as createRef,b as effect,F as h,nn as html,lt as hydrate,w as isValidElement,S as options,ut as render,s as signal,z as toChildArray,Dt as useCallback,Qt as useComputed,$t as useContext,Tt as useDebugValue,Et as useEffect,Vt as useErrorBoundary,At as useId,Nt as useImperativeHandle,Ut as useLayoutEffect,Pt as useMemo,Ct as useReducer,Ht as useRef,Kt as useSignal,Xt as useSignalEffect,wt as useState}; +function t(){throw new Error("Cycle detected")}function n(){if(u>1){u--;return}let t,n=!1;while(void 0!==_){let i=_;_=void 0;f++;while(void 0!==i){const _=i.o;i.o=void 0;i.f&=-3;if(!(8&i.f)&&a(i))try{i.c()}catch(e){if(!n){t=e;n=!0}}i=_}}f=0;u--;if(n)throw t}function e(t){if(u>0)return t();u++;try{return t()}finally{n()}}let i,_,o=0;function r(t){if(o>0)return t();const n=i;i=void 0;o++;try{return t()}finally{o--;i=n}}let u=0,f=0,l=0;function s(t){if(void 0===i)return;let n=t.n;if(void 0===n||n.t!==i){n={i:0,S:t,p:i.s,n:void 0,t:i,e:void 0,x:void 0,r:n};if(void 0!==i.s)i.s.n=n;i.s=n;t.n=n;if(32&i.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=i.s;n.n=void 0;i.s.n=n;i.s=n}return n}}function c(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}c.prototype.h=function(){return!0};c.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};c.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};c.prototype.subscribe=function(t){const n=this;return S((function(){const e=n.value,i=32&this.f;this.f&=-33;try{t(e)}finally{this.f|=i}}))};c.prototype.valueOf=function(){return this.value};c.prototype.toString=function(){return this.value+""};c.prototype.toJSON=function(){return 
this.value};c.prototype.peek=function(){return this.v};Object.defineProperty(c.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(e){if(i instanceof v)!function(){throw new Error("Computed cannot have side-effects")}();if(e!==this.v){if(f>100)t();this.v=e;this.i++;l++;u++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function h(t){return new c(t)}function a(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function p(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function d(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function v(t){c.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(v.prototype=new c).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!a(this)){this.f&=-2;return!0}const t=i;try{p(this);i=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}i=t;d(this);this.f&=-2;return!0};v.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}c.prototype.S.call(this,t)};v.prototype.U=function(t){if(void 0!==this.t){c.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};v.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};v.prototype.peek=function(){if(!this.h())t();if(16&this.f)throw this.v;return this.v};Object.defineProperty(v.prototype,"value",{get(){if(1&this.f)t();const n=s(this);this.h();if(void 0!==n)n.i=this.i;if(16&this.f)throw this.v;return this.v}});function y(t){return new v(t)}function m(t){const e=t.u;t.u=void 0;if("function"==typeof e){u++;const _=i;i=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;g(t);throw n}finally{i=_;n()}}}function g(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;m(t)}function b(t){if(i!==this)throw new Error("Out-of-order effect");d(this);i=t;this.f&=-2;if(8&this.f)g(this);n()}function k(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}k.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};k.prototype.S=function(){if(1&this.f)t();this.f|=1;this.f&=-9;m(this);p(this);u++;const n=i;i=this;return b.bind(this,n)};k.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=_;_=this}};k.prototype.d=function(){this.f|=8;if(!(1&this.f))g(this)};function S(t){const n=new k(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var x,w,C,E,U,H,N,P,$,D={},T=[],V=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,A=Array.isArray;function F(t,n){for(var e in n)t[e]=n[e];return t}function M(t){var n=t.parentNode;n&&n.removeChild(t)}function W(t,n,e){var i,_,o,r={};for(o in n)"key"==o?i=n[o]:"ref"==o?_=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?x.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return O(t,r,i,_,null)}function O(t,n,e,i,_){var o={type:t,props:n,key:e,ref:i,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,__h:null,constructor:void 0,__v:null==_?++C:_};return null==_&&null!=w.vnode&&w.vnode(o),o}function 
L(){return{current:null}}function R(t){return t.children}function I(t,n){this.props=t,this.context=n}function j(t,n){if(null==n)return t.__?j(t.__,t.__.__k.indexOf(t)+1):null;for(var e;nn&&U.sort(P));G.__r=0}function z(t,n,e,i,_,o,r,u,f,l,s){var c,h,a,p,d,v,y,m,g,b,k=0,S=i&&i.__k||T,x=S.length,w=x,C=n.length;for(e.__k=[],c=0;c0?O(p.type,p.props,p.key,p.ref?p.ref:null,p.__v):p)&&(p.__=e,p.__b=e.__b+1,-1===(m=X(p,S,y=c+k,w))?a=D:(a=S[m]||D,S[m]=void 0,w--),it(t,p,a,_,o,r,u,f,l,s),d=p.__e,(h=p.ref)&&a.ref!=h&&(a.ref&&rt(a.ref,null,p),s.push(h,p.__c||d,p)),null!=d&&(null==v&&(v=d),b=!(g=a===D||null===a.__v)&&m===y,g?-1==m&&k--:m!==y&&(m===y+1?(k++,b=!0):m>y?w>C-y?(k+=m-y,b=!0):k--:k=m(null!=f?1:0))for(;r>=0||u=0){if((f=n[r])&&_==f.key&&o===f.type)return r;r--}if(u2&&(u.children=arguments.length>3?x.call(arguments,2):e),O(t.type,u,i||t.key,_||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+$++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,i;return this.getChildContext||(e=[],(i={})[n]=this,this.getChildContext=function(){return i},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,q(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}x=T.slice,w={__e:function(t,n,e,i){for(var _,o,r;n=n.__;)if((_=n.__c)&&!_.__)try{if((o=_.constructor)&&null!=o.getDerivedStateFromError&&(_.setState(o.getDerivedStateFromError(t)),r=_.__d),null!=_.componentDidCatch&&(_.componentDidCatch(t,i||{}),r=_.__d),r)return _.__E=_}catch(n){t=n}throw t}},C=0,E=function(t){return null!=t&&void 0===t.constructor},I.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=F({},this.state),"function"==typeof t&&(t=t(F({},e),this.props)),t&&F(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),q(this))},I.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),q(this))},I.prototype.render=R,U=[],N="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},G.__r=0,$=0;var at,pt,dt,vt,yt=0,mt=[],gt=[],bt=w.__b,kt=w.__r,St=w.diffed,xt=w.__c,wt=w.unmount;function Ct(t,n){w.__h&&w.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({__V:gt}),e.__[t]}function Et(t){return yt=1,Ut(Bt,t)}function Ut(t,n,e){var i=Ct(at++,2);if(i.t=t,!i.__c&&(i.__=[e?e(n):Bt(void 0,n),function(t){var n=i.__N?i.__N[0]:i.__[0],e=i.t(n,t);n!==e&&(i.__N=[e,i.__[1]],i.__c.setState({}))}],i.__c=pt,!pt.u)){var _=function(t,n,e){if(!i.__c.__H)return!0;var _=i.__c.__H.__.filter((function(t){return t.__c}));if(_.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return _.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&i.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var i=o;o=void 0,_(t,n,e),o=i}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=_}return i.__N||i.__}function Ht(t,n){var e=Ct(at++,3);!w.__s&&jt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ct(at++,4);!w.__s&&jt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function Pt(t){return yt=5,Dt((function(){return{current:t}}),[])}function $t(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return 
t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ct(at++,7);return jt(e.__H,n)?(e.__V=t(),e.i=n,e.__h=t,e.__V):e.__}function Tt(t,n){return yt=8,Dt((function(){return t}),n)}function Vt(t){var n=pt.context[t.__c],e=Ct(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function At(t,n){w.useDebugValue&&w.useDebugValue(n?n(t):t)}function Ft(t){var n=Ct(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,i){n.__&&n.__(t,i),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Mt(){var t=Ct(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Wt(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Rt),t.__H.__h.forEach(It),t.__H.__h=[]}catch(u){t.__H.__h=[],w.__e(u,t.__v)}}w.__b=function(t){pt=null,bt&&bt(t)},w.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.__V=gt,t.__N=t.i=void 0}))):(n.__h.forEach(Rt),n.__h.forEach(It),n.__h=[],at=0)),dt=pt},w.diffed=function(t){St&&St(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===w.requestAnimationFrame||((vt=w.requestAnimationFrame)||Lt)(Wt)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.__V!==gt&&(t.__=t.__V),t.i=void 0,t.__V=gt}))),dt=pt=null},w.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Rt),t.__h=t.__h.filter((function(t){return!t.__||It(t)}))}catch(s){n.some((function(t){t.__h&&(t.__h=[])})),n=[],w.__e(s,t.__v)}})),xt&&xt(t,n)},w.unmount=function(t){wt&&wt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Rt(t)}catch(t){n=t}})),e.__H=void 0,n&&w.__e(n,e.__v))};var Ot="function"==typeof requestAnimationFrame;function Lt(t){var n,e=function(){clearTimeout(i),Ot&&cancelAnimationFrame(n),setTimeout(t)},i=setTimeout(e,100);Ot&&(n=requestAnimationFrame(e))}function Rt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function It(t){var n=pt;t.__c=t.__(),pt=n}function jt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function qt(t,n){w[t]=n.bind(null,w[t]||(()=>{}))}let Gt,zt;function Jt(t){if(zt)zt();zt=t&&t.S()}function Kt({data:t}){const n=Xt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!E(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return y(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Kt.displayName="_st";Object.defineProperties(c.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Kt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});qt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let i in e){if("children"===i)continue;let _=e[i];if(_ instanceof c){if(!t)n.__np=t={};t[i]=_;e[i]=_.peek()}}}t(n)});qt("__r",(t,n)=>{Jt();let e,i=n.__c;if(i){i.__$f&=-2;e=i.__$u;if(void 0===e)i.__$u=e=function(t){let n;S((function(){n=this}));n.c=()=>{i.__$f|=1;i.setState({})};return n}()}Gt=i;Jt(e);t(n)});qt("__e",(t,n,e,i)=>{Jt();Gt=void 0;t(n,e,i)});qt("diffed",(t,n)=>{Jt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,i=n.props;if(t){let n=e.U;if(n)for(let e in n){let i=n[e];if(void 0!==i&&!(e in t)){i.d();n[e]=void 0}}else{n={};e.U=n}for(let _ in t){let 
o=n[_],r=t[_];if(void 0===o){o=Qt(e,_,r,i);n[_]=o}else o.o(r,i)}}}t(n)});function Qt(t,n,e,i){const _=n in t&&void 0===t.ownerSVGElement,o=h(e);return{o:(t,n)=>{o.value=t;i=n},d:S(()=>{const e=o.value.value;if(i[n]!==e){i[n]=e;if(_)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}qt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});qt("__h",(t,n,e,i)=>{if(i<3||9===i)n.__$f|=2;t(n,e,i)});I.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let i in n)return!0;for(let i in t)if("__source"!==i&&t[i]!==this.props[i])return!0;for(let i in this.props)if(!(i in t))return!0;return!1};function Xt(t){return Dt(()=>h(t),[])}function Yt(t){const n=Pt(t);n.current=t;Gt.__$f|=4;return Dt(()=>y(()=>n.current()),[])}function Zt(t){const n=Pt(t);n.current=t;Ht(()=>S(()=>n.current()),[])}var tn=function(t,n,e,i){var _;n[0]=0;for(var o=1;o=5&&((_||!t&&5===i)&&(r.push(i,0,_,e),i=6),t&&(r.push(i,t,0,e),i=6)),_=""},f=0;f"===n?(i=1,_=""):_=n+_[0]:o?n===o?o="":_+=n:'"'===n||"'"===n?o=n:">"===n?(u(),i=1):i&&("="===n?(i=5,e=_,_=""):"/"===n&&(i<5||">"===t[f][l+1])?(u(),3===i&&(r=r[0]),i=r,(r=r[0]).push(2,0,i),i=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),i=2):_+=n),3===i&&"!--"===_&&(i=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var _n=en.bind(W);export{I as Component,R as Fragment,c as Signal,e as batch,ct as cloneElement,y as computed,ht as createContext,W as createElement,L as createRef,S as effect,W as h,_n as html,st as hydrate,E as isValidElement,w as options,lt as render,h as signal,K as toChildArray,r as untracked,Tt as useCallback,Yt as useComputed,Vt as useContext,At as useDebugValue,Ht as useEffect,Ft as useErrorBoundary,Mt as useId,$t as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ut as useReducer,Pt as useRef,Xt as useSignal,Zt as useSignalEffect,Et as useState}; diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs new file mode 100644 index 0000000000000000000000000000000000000000..3f1b255c262a36b74d7f4e57090526247e3a9883 --- /dev/null +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -0,0 +1,112 @@ +const SPACE_RULE = '" "?'; + +const PRIMITIVE_RULES = { + boolean: '("true" | "false") space', + number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space', + integer: '("-"? 
([0-9] | [1-9] [0-9]*)) space', + string: ` "\\"" ( + [^"\\\\] | + "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\\"" space`, + null: '"null" space', +}; + +const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g; +const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g; +const GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}; + +export class SchemaConverter { + constructor(propOrder) { + this._propOrder = propOrder || {}; + this._rules = new Map(); + this._rules.set('space', SPACE_RULE); + } + + _formatLiteral(literal) { + const escaped = JSON.stringify(literal).replace( + GRAMMAR_LITERAL_ESCAPE_RE, + m => GRAMMAR_LITERAL_ESCAPES[m] + ); + return `"${escaped}"`; + } + + _addRule(name, rule) { + let escName = name.replace(INVALID_RULE_CHARS_RE, '-'); + let key = escName; + + if (this._rules.has(escName)) { + if (this._rules.get(escName) === rule) { + return key; + } + + let i = 0; + while (this._rules.has(`${escName}${i}`)) { + i += 1; + } + key = `${escName}${i}`; + } + + this._rules.set(key, rule); + return key; + } + + visit(schema, name) { + const schemaType = schema.type; + const ruleName = name || 'root'; + + if (schema.oneOf || schema.anyOf) { + const rule = (schema.oneOf || schema.anyOf).map((altSchema, i) => + this.visit(altSchema, `${name}${name ? "-" : ""}${i}`) + ).join(' | '); + + return this._addRule(ruleName, rule); + } else if ('const' in schema) { + return this._addRule(ruleName, this._formatLiteral(schema.const)); + } else if ('enum' in schema) { + const rule = schema.enum.map(v => this._formatLiteral(v)).join(' | '); + return this._addRule(ruleName, rule); + } else if (schemaType === 'object' && 'properties' in schema) { + // TODO: `required` keyword (from python implementation) + const propOrder = this._propOrder; + const propPairs = Object.entries(schema.properties).sort((a, b) => { + // sort by position in prop_order (if specified) then by key + const orderA = typeof propOrder[a[0]] === 'number' ? propOrder[a[0]] : Infinity; + const orderB = typeof propOrder[b[0]] === 'number' ? propOrder[b[0]] : Infinity; + return orderA - orderB || a[0].localeCompare(b[0]); + }); + + let rule = '"{" space'; + propPairs.forEach(([propName, propSchema], i) => { + const propRuleName = this.visit(propSchema, `${name}${name ? "-" : ""}${propName}`); + if (i > 0) { + rule += ' "," space'; + } + rule += ` ${this._formatLiteral(propName)} space ":" space ${propRuleName}`; + }); + rule += ' "}" space'; + + return this._addRule(ruleName, rule); + } else if (schemaType === 'array' && 'items' in schema) { + // TODO `prefixItems` keyword (from python implementation) + const itemRuleName = this.visit(schema.items, `${name}${name ? "-" : ""}item`); + const rule = `"[" space (${itemRuleName} ("," space ${itemRuleName})*)? "]" space`; + return this._addRule(ruleName, rule); + } else { + if (!PRIMITIVE_RULES[schemaType]) { + throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`); + } + return this._addRule( + ruleName === 'root' ? 
'root' : schemaType, + PRIMITIVE_RULES[schemaType] + ); + } + } + + formatGrammar() { + let grammar = ''; + this._rules.forEach((rule, name) => { + grammar += `${name} ::= ${rule}\n`; + }); + return grammar; + } +} diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 10ae264f516f4e80af02b7c320fa45a24cc06669..ebd7f2fc579e992245574b4f58e2b7abe24db4f7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -15,6 +15,9 @@ #include "index.html.hpp" #include "index.js.hpp" #include "completion.js.hpp" +#include "json-schema-to-grammar.mjs.hpp" + +#include #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 @@ -93,7 +96,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) std::string ret; for (; begin != end; ++begin) { - ret += llama_token_to_str(ctx, *begin); + ret += llama_token_to_piece(ctx, *begin); } return ret; } @@ -115,16 +118,17 @@ static void server_log(const char *level, const char *function, int line, } const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); - fprintf(stdout, "%.*s\n", (int)str.size(), str.data()); + printf("%.*s\n", (int)str.size(), str.data()); fflush(stdout); } // format incomplete utf-8 multibyte character for output static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) { - std::string out = token == -1 ? "" : llama_token_to_str(ctx, token); - // if first bit is 1, meaning it's a partial character - if (out.size() > 0 && (out[0] & 0x80) == 0x80) + std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); + // if the size is 1 and first bit is 1, meaning it's a partial character + // (size > 1 meaning it's already a known token) + if (out.size() == 1 && (out[0] & 0x80) == 0x80) { std::stringstream ss; ss << std::hex << (out[0] & 0xff); @@ -135,7 +139,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c } // convert a vector of completion_token_output to json -static json probs_vector_to_json(const llama_context *ctx, const std::vector probs) +static json probs_vector_to_json(const llama_context *ctx, const std::vector & probs) { json out = json::array(); for (const auto &prob : probs) @@ -189,6 +193,7 @@ struct llama_server_context size_t n_past = 0; size_t n_remain = 0; + json prompt; std::vector embd; std::vector last_n_tokens; @@ -196,6 +201,7 @@ struct llama_server_context llama_context *ctx = nullptr; gpt_params params; + grammar_parser::parse_state parsed_grammar; llama_grammar *grammar = nullptr; bool truncated = false; @@ -241,10 +247,13 @@ struct llama_server_context stopped_limit = false; stopping_word = ""; multibyte_pending = 0; - grammar = nullptr; - n_remain = 0; n_past = 0; + + if (grammar != nullptr) { + llama_grammar_free(grammar); + grammar = nullptr; + } } bool loadModel(const gpt_params ¶ms_) @@ -262,11 +271,54 @@ struct llama_server_context return true; } + std::vector tokenize(const json & json_prompt, bool add_bos) const + { + // If `add_bos` is true, we only add BOS, when json_prompt is a string, + // or the first element of the json_prompt array is a string. 
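+        //
+        // e.g. (hypothetical request, for illustration only): "prompt": ["Hello", 42, " world"]
+        //   -> [BOS, tokens("Hello")..., 42, tokens(" world")...]
+        // raw integer ids are forwarded unchanged, and BOS is added at most
+        // once, before the first element.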
+ std::vector prompt_tokens; + + if (json_prompt.is_array()) + { + bool first = true; + for (const auto& p : json_prompt) + { + if (p.is_string()) + { + auto s = p.template get(); + std::vector p; + if (first) + { + p = ::llama_tokenize(ctx, s, add_bos); + first = false; + } + else + { + p = ::llama_tokenize(ctx, s, false); + } + prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); + } + else + { + if (first) + { + first = false; + } + prompt_tokens.push_back(p.template get()); + } + } + } + else + { + auto s = json_prompt.template get(); + prompt_tokens = ::llama_tokenize(ctx, s, add_bos); + } + + return prompt_tokens; + } + bool loadGrammar() { if (!params.grammar.empty()) { - grammar_parser::parse_state parsed_grammar; - parsed_grammar = grammar_parser::parse(params.grammar.c_str()); // will be empty (default) if there are parse errors if (parsed_grammar.rules.empty()) { @@ -276,7 +328,7 @@ struct llama_server_context grammar_parser::print_grammar(stderr, parsed_grammar); { - auto it = params.logit_bias.find(llama_token_eos()); + auto it = params.logit_bias.find(llama_token_eos(ctx)); if (it != params.logit_bias.end() && it->second == -INFINITY) { LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {}); } @@ -291,8 +343,8 @@ struct llama_server_context void loadPrompt() { - params.prompt.insert(0, 1, ' '); // always add a first space - std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); + auto prompt_tokens = tokenize(prompt, true); // always add BOS + num_prompt_tokens = prompt_tokens.size(); if (params.n_keep < 0) @@ -399,7 +451,7 @@ struct llama_server_context if (params.n_predict == 0) { has_next_token = false; - result.tok = llama_token_eos(); + result.tok = llama_token_eos(ctx); return result; } @@ -439,7 +491,7 @@ struct llama_server_context llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; // Apply penalties - float nl_logit = logits[llama_token_nl()]; + float nl_logit = logits[llama_token_nl(ctx)]; auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); llama_sample_repetition_penalty(ctx, &candidates_p, last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, @@ -449,7 +501,7 @@ struct llama_server_context last_n_repeat, alpha_frequency, alpha_presence); if (!penalize_nl) { - logits[llama_token_nl()] = nl_logit; + logits[llama_token_nl(ctx)] = nl_logit; } if (grammar != nullptr) { @@ -512,9 +564,9 @@ struct llama_server_context // decrement remaining sampling budget --n_remain; - if (!embd.empty() && embd.back() == llama_token_eos()) + if (!embd.empty() && embd.back() == llama_token_eos(ctx)) { - // stopping_word = llama_token_to_str(ctx, embd.back()); + // stopping_word = llama_token_to_piece(ctx, embd.back()); has_next_token = false; stopped_eos = true; LOG_VERBOSE("eos token found", {}); @@ -559,9 +611,9 @@ struct llama_server_context completion_token_output doCompletion() { - const completion_token_output token_with_probs = nextToken(); + auto token_with_probs = nextToken(); - const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok); + const std::string token_text = token_with_probs.tok == -1 ? 
"" : llama_token_to_piece(ctx, token_with_probs.tok); generated_text += token_text; if (params.n_probs > 0) @@ -642,52 +694,50 @@ struct llama_server_context static void server_print_usage(const char *argv0, const gpt_params ¶ms, const server_params &sparams) { - fprintf(stdout, "usage: %s [options]\n", argv0); - fprintf(stdout, "\n"); - fprintf(stdout, "options:\n"); - fprintf(stdout, " -h, --help show this help message and exit\n"); - fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); - fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa); - fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps); - fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); - fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); - fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); + printf("usage: %s [options]\n", argv0); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); if (llama_mlock_supported()) { - fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } if (llama_mmap_supported()) { - fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } + printf(" --numa attempt optimizations that help on some NUMA systems\n"); #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stdout, " -ngl N, --n-gpu-layers N\n"); - fprintf(stdout, " number of layers to store in VRAM\n"); - fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); - fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 
3,1\n"); - fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); - fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); - fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" ); - fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" ); - fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" ); + printf(" -ngl N, --n-gpu-layers N\n"); + printf(" number of layers to store in VRAM\n"); + printf(" -ts SPLIT --tensor-split SPLIT\n"); + printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); + printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n"); + printf(" -nommq, --no-mul-mat-q\n"); + printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); + printf(" Not recommended since this is both slower and uses more VRAM.\n"); #endif - fprintf(stdout, " -m FNAME, --model FNAME\n"); - fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); - fprintf(stdout, " -a ALIAS, --alias ALIAS\n"); - fprintf(stdout, " set an alias for the model, will be added as `model` field in completion response\n"); - fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - fprintf(stdout, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); - fprintf(stdout, " --port PORT port to listen (default (default: %d)\n", sparams.port); - fprintf(stdout, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); - fprintf(stdout, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); - fprintf(stdout, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); - fprintf(stdout, "\n"); + printf(" -m FNAME, --model FNAME\n"); + printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -a ALIAS, --alias ALIAS\n"); + printf(" set an alias for the model, will be added as `model` field in completion response\n"); + printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); + printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); + printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); + printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? 
"enabled" : "disabled"); + printf("\n"); } static void server_params_parse(int argc, char **argv, server_params &sparams, @@ -770,23 +820,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_ctx = std::stoi(argv[i]); } - else if (arg == "-gqa" || arg == "--gqa") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.n_gqa = std::stoi(argv[i]); - } - else if (arg == "-eps" || arg == "--rms-norm-eps") { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.rms_norm_eps = std::stof(argv[i]); - } else if (arg == "--rope-freq-base") { if (++i >= argc) @@ -882,12 +915,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {}); #endif // GGML_USE_CUBLAS } - else if (arg == "--mul-mat-q" || arg == "-mmq") + else if (arg == "--no-mul-mat-q" || arg == "-nommq") { #ifdef GGML_USE_CUBLAS - params.mul_mat_q = true; + params.mul_mat_q = false; #else - LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {}); + LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {}); #endif // GGML_USE_CUBLAS } else if (arg == "--main-gpu" || arg == "-mg") @@ -938,6 +971,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, { params.use_mmap = false; } + else if (arg == "--numa") + { + params.numa = true; + } else if (arg == "--embedding") { params.embedding = true; @@ -960,7 +997,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, static json format_generation_settings(llama_server_context &llama) { - const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); + const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx)); const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); @@ -1003,10 +1040,10 @@ static json format_timings(llama_server_context &llama) { const auto timings = llama_get_timings(llama.ctx); - assert(timings.n_eval == llama.num_tokens_predicted); + assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted)); return json{ - {"prompt_n", timings.n_eval}, + {"prompt_n", timings.n_p_eval}, {"prompt_ms", timings.t_p_eval_ms}, {"prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval}, {"prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval}, @@ -1028,14 +1065,13 @@ static json format_final_response(llama_server_context &llama, const std::string {"tokens_predicted", llama.num_tokens_predicted}, {"tokens_evaluated", llama.num_prompt_tokens}, {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, + {"prompt", llama.prompt}, {"truncated", llama.truncated}, {"stopped_eos", llama.stopped_eos}, {"stopped_word", llama.stopped_word}, {"stopped_limit", llama.stopped_limit}, {"stopping_word", llama.stopping_word}, {"tokens_cached", llama.n_past}, - {"tokens_predicted", llama.num_tokens_predicted}, {"timings", format_timings(llama)}, }; @@ -1047,8 +1083,9 @@ static json format_final_response(llama_server_context &llama, const std::string return res; } -static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector &probs) -{ +static json format_partial_response( + llama_server_context &llama, const std::string &content, const 
std::vector<completion_token_output> &probs
+) {
     json res = json{
         {"content", content},
         {"stop", false},
@@ -1068,35 +1105,58 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
         {"tokens", tokens}};
 }

+static json format_detokenized_response(std::string content)
+{
+    return json{
+        {"content", content}};
+}
+
+template <typename T>
+static T json_value(const json &body, const std::string &key, const T &default_value)
+{
+    // Fallback null to default value
+    return body.contains(key) && !body.at(key).is_null()
+        ? body.value(key, default_value)
+        : default_value;
+}
+
 static void parse_options_completion(const json &body, llama_server_context &llama)
 {
     gpt_params default_params;

-    llama.stream = body.value("stream", false);
-    llama.params.n_predict = body.value("n_predict", default_params.n_predict);
-    llama.params.top_k = body.value("top_k", default_params.top_k);
-    llama.params.top_p = body.value("top_p", default_params.top_p);
-    llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z);
-    llama.params.typical_p = body.value("typical_p", default_params.typical_p);
-    llama.params.repeat_last_n = body.value("repeat_last_n", default_params.repeat_last_n);
-    llama.params.temp = body.value("temperature", default_params.temp);
-    llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty);
-    llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty);
-    llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty);
-    llama.params.mirostat = body.value("mirostat", default_params.mirostat);
-    llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau);
-    llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta);
-    llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
-    llama.params.n_keep = body.value("n_keep", default_params.n_keep);
-    llama.params.seed = body.value("seed", default_params.seed);
-    llama.params.prompt = body.value("prompt", default_params.prompt);
-    llama.params.grammar = body.value("grammar", default_params.grammar);
-    llama.params.n_probs = body.value("n_probs", default_params.n_probs);
+    llama.stream = json_value(body, "stream", false);
+    llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
+    llama.params.top_k = json_value(body, "top_k", default_params.top_k);
+    llama.params.top_p = json_value(body, "top_p", default_params.top_p);
+    llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
+    llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
+    llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
+    llama.params.temp = json_value(body, "temperature", default_params.temp);
+    llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
+    llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
+    llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
+    llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
+    llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
+    llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
+    llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
+    llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
+    
llama.params.seed = json_value(body, "seed", default_params.seed); + llama.params.grammar = json_value(body, "grammar", default_params.grammar); + llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs); + + if (body.count("prompt") != 0) + { + llama.prompt = body["prompt"]; + } + else + { + llama.prompt = ""; + } llama.params.logit_bias.clear(); - if (body.value("ignore_eos", false)) + if (json_value(body, "ignore_eos", false)) { - llama.params.logit_bias[llama_token_eos()] = -INFINITY; + llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY; } const auto &logit_bias = body.find("logit_bias"); @@ -1156,6 +1216,63 @@ static void log_server_request(const Request &req, const Response &res) }); } +static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) { + return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx); +} + +// Function matching type llama_beam_search_callback_fn_t. +// Custom callback example is called each time the beams lengths increase: +// * Show progress by printing ',' following by number of convergent beam tokens if any. +// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. +// This is also called when the stop condition is met. +// Collect tokens into std::vector response which is pointed to by callback_data. +static void beam_search_callback(void *callback_data, llama_beams_state beams_state) { + auto & llama = *static_cast(callback_data); + // Mark beams as EOS as needed. + for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { + llama_beam_view& beam_view = beams_state.beam_views[i]; + if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) { + beam_view.eob = true; + } + } + printf(","); // Show progress + if (const size_t n = beams_state.common_prefix_length) { + llama.generated_token_probs.resize(llama.generated_token_probs.size() + n); + assert(0u < beams_state.n_beams); + const llama_token * tokens = beams_state.beam_views[0].tokens; + const auto map = [](llama_token tok) { return completion_token_output{{},tok}; }; + std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map); + printf("%zu", n); + } + fflush(stdout); +#if 0 // DEBUG: print current beams for this iteration + std::cout << "\n\nCurrent beams:\n"; + for (size_t i=0 ; i < beams_state.n_beams ; ++i) { + std::cout << "beams["<(&completion_js), completion_js_len, "application/javascript"); return false; }); + // this is only called if no index.html is found in the public --path + svr.Get("/json-schema-to-grammar.mjs", [](const Request &, Response &res) + { + res.set_content(reinterpret_cast(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript"); + return false; }); + svr.Post("/completion", [&llama](const Request &req, Response &res) { auto lock = llama.lock(); @@ -1232,25 +1355,39 @@ int main(int argc, char **argv) llama.beginCompletion(); if (!llama.stream) { - size_t stop_pos = std::string::npos; + if (llama.params.n_beams) { + // Fill llama.generated_token_probs vector with final beam. + llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams, + llama.n_past, llama.n_remain, llama.params.n_threads); + // Translate llama.generated_token_probs to llama.generated_text. 
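+                // (roughly: each accepted beam token is converted to its text
+                //  piece via llama_token_to_piece() and appended to
+                //  llama.generated_text)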
+ append_to_generated_text_from_generated_token_probs(llama); + } else { + size_t stop_pos = std::string::npos; - while (llama.has_next_token) { - const completion_token_output token_with_probs = llama.doCompletion(); - const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok); + while (llama.has_next_token) { + const completion_token_output token_with_probs = llama.doCompletion(); + const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok); - stop_pos = llama.findStoppingStrings(llama.generated_text, - token_text.size(), STOP_FULL); - } + stop_pos = llama.findStoppingStrings(llama.generated_text, + token_text.size(), STOP_FULL); + } - if (stop_pos == std::string::npos) { - stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); + if (stop_pos == std::string::npos) { + stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); + } + if (stop_pos != std::string::npos) { + llama.generated_text.erase(llama.generated_text.begin() + stop_pos, + llama.generated_text.end()); + } } - if (stop_pos != std::string::npos) { - llama.generated_text.erase(llama.generated_text.begin() + stop_pos, - llama.generated_text.end()); + + auto probs = llama.generated_token_probs; + if (llama.params.n_probs > 0 && llama.stopped_word) { + const std::vector stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false); + probs = std::vector(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size()); } - const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs); + const json data = format_final_response(llama, llama.generated_text, probs); llama_print_timings(llama.ctx); @@ -1263,59 +1400,90 @@ int main(int argc, char **argv) while (llama.has_next_token) { const completion_token_output token_with_probs = llama.doCompletion(); - const std::string token_text = token_with_probs.tok == -1 ? 
"" : llama_token_to_str(llama.ctx, token_with_probs.tok); - if (llama.multibyte_pending > 0) { + if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) { continue; } + const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok); size_t pos = std::min(sent_count, llama.generated_text.size()); const std::string str_test = llama.generated_text.substr(pos); + bool is_stop_full = false; size_t stop_pos = llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL); if (stop_pos != std::string::npos) { + is_stop_full = true; llama.generated_text.erase( llama.generated_text.begin() + pos + stop_pos, llama.generated_text.end()); pos = std::min(sent_count, llama.generated_text.size()); } else { + is_stop_full = false; stop_pos = llama.findStoppingStrings(str_test, token_text.size(), STOP_PARTIAL); } - const std::string to_send = llama.generated_text.substr(pos, stop_pos); - sent_count += to_send.size(); + if ( + stop_pos == std::string::npos || + // Send rest of the text if we are at the end of the generation + (!llama.has_next_token && !is_stop_full && stop_pos > 0) + ) { + const std::string to_send = llama.generated_text.substr(pos, std::string::npos); + + sent_count += to_send.size(); + + std::vector probs_output = {}; + + if (llama.params.n_probs > 0) { + const std::vector to_send_toks = llama_tokenize(llama.ctx, to_send, false); + size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); + size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); + if (probs_pos < probs_stop_pos) { + probs_output = std::vector(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos); + } + sent_token_probs_index = probs_stop_pos; + } - std::vector probs_output = {}; + const json data = format_partial_response(llama, to_send, probs_output); - if (llama.params.n_probs > 0) { - const std::vector to_send_toks = llama_tokenize(llama.ctx, to_send, false); - size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); - size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); - if (probs_pos < probs_stop_pos) { - probs_output = std::vector(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos); - } - sent_token_probs_index = probs_stop_pos; - } + const std::string str = + "data: " + + data.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; - const json data = llama.has_next_token - ? format_partial_response(llama, to_send, probs_output) - // Generation is done, send extra information. - : format_final_response(llama, to_send, llama.generated_token_probs); + LOG_VERBOSE("data stream", { + { "to_send", str } + }); - const std::string str = - "data: " + - data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; + if (!sink.write(str.data(), str.size())) { + LOG_VERBOSE("stream closed", {}); + llama_print_timings(llama.ctx); + return false; + } + } - LOG_VERBOSE("data stream", { - { "to_send", str } - }); + if (!llama.has_next_token) { + // Generation is done, send extra information. 
+ const json data = format_final_response( + llama, + "", + std::vector(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index) + ); + + const std::string str = + "data: " + + data.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + + LOG_VERBOSE("data stream", { + { "to_send", str } + }); - if (!sink.write(str.data(), str.size())) { - LOG_VERBOSE("stream closed", {}); - llama_print_timings(llama.ctx); - return false; + if (!sink.write(str.data(), str.size())) { + LOG_VERBOSE("stream closed", {}); + llama_print_timings(llama.ctx); + return false; + } } } @@ -1343,11 +1511,29 @@ int main(int argc, char **argv) auto lock = llama.lock(); const json body = json::parse(req.body); - const std::string content = body.value("content", ""); - const std::vector tokens = llama_tokenize(llama.ctx, content, false); + std::vector tokens; + if (body.count("content") != 0) + { + tokens = llama.tokenize(body["content"], false); + } const json data = format_tokenizer_response(tokens); return res.set_content(data.dump(), "application/json"); }); + svr.Post("/detokenize", [&llama](const Request &req, Response &res) + { + auto lock = llama.lock(); + + const json body = json::parse(req.body); + std::string content; + if (body.count("tokens") != 0) + { + const std::vector tokens = body["tokens"]; + content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend()); + } + + const json data = format_detokenized_response(content); + return res.set_content(data.dump(), "application/json"); }); + svr.Post("/embedding", [&llama](const Request &req, Response &res) { auto lock = llama.lock(); @@ -1356,7 +1542,14 @@ int main(int argc, char **argv) llama.rewind(); llama_reset_timings(llama.ctx); - llama.params.prompt = body.value("content", ""); + if (body.count("content") != 0) + { + llama.prompt = body["content"]; + } + else + { + llama.prompt = ""; + } llama.params.n_predict = 0; llama.loadPrompt(); llama.beginCompletion(); @@ -1369,7 +1562,7 @@ int main(int argc, char **argv) svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) { - const auto * fmt = "500 Internal Server Error\n%s"; + const char fmt[] = "500 Internal Server Error\n%s"; char buf[BUFSIZ]; try { std::rethrow_exception(std::move(ep)); @@ -1385,7 +1578,7 @@ int main(int argc, char **argv) { if (res.status == 400) { res.set_content("Invalid request", "text/plain"); - } else { + } else if (res.status != 500) { res.set_content("File Not Found", "text/plain"); res.status = 404; } }); @@ -1404,7 +1597,7 @@ int main(int argc, char **argv) svr.set_base_dir(sparams.public_path); // to make it ctrl+clickable: - fprintf(stdout, "\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); + printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); LOG_INFO("HTTP server listening", { {"hostname", sparams.hostname}, diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index 0ac9cb03a8eca4494f38e6c57efc4909d63ffb9f..7da5ff6f3ac041e4a737c63c0613b64616f72e3a 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -3,6 +3,3 @@ add_executable(${TARGET} simple.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) -if(TARGET BUILD_INFO) - add_dependencies(${TARGET} BUILD_INFO) -endif() diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp 
index 97137a6584aa3d6a0741e9689ef0d4dab6a8a804..440d22ecfb4a8917816dd7ecb4967be858ddc1a8 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,181 +1,124 @@ -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - #include "common.h" #include "llama.h" -#include "build-info.h" -#include -#include #include #include -#include -#include -#include -#include #include #include -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) -#include -#include -#elif defined (_WIN32) -#define WIN32_LEAN_AND_MEAN -#define NOMINMAX -#include -#include -#endif - - - -int main(int argc, char ** argv) -{ +int main(int argc, char ** argv) { gpt_params params; - //--------------------------------- - // Print help : - //--------------------------------- - - if ( argc == 1 || argv[1][0] == '-' ) - { - printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] ); + if (argc == 1 || argv[1][0] == '-') { + printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]); return 1 ; } - //--------------------------------- - // Load parameters : - //--------------------------------- - - if ( argc >= 2 ) - { + if (argc >= 2) { params.model = argv[1]; } - if ( argc >= 3 ) - { + if (argc >= 3) { params.prompt = argv[2]; } - if ( params.prompt.empty() ) - { + if (params.prompt.empty()) { params.prompt = "Hello my name is"; } - //--------------------------------- - // Init LLM : - //--------------------------------- + // init LLM llama_backend_init(params.numa); - llama_model * model; - llama_context * ctx; + llama_context_params ctx_params = llama_context_default_params(); - std::tie(model, ctx) = llama_init_from_gpt_params( params ); + llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params); - if ( model == NULL ) - { - fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); + if (model == NULL) { + fprintf(stderr , "%s: error: unable to load model\n" , __func__); return 1; } - //--------------------------------- - // Tokenize the prompt : - //--------------------------------- + llama_context * ctx = llama_new_context_with_model(model, ctx_params); + + // tokenize the prompt std::vector tokens_list; - tokens_list = ::llama_tokenize( ctx , params.prompt , true ); + tokens_list = ::llama_tokenize(ctx, params.prompt, true); - const int max_context_size = llama_n_ctx( ctx ); - const int max_tokens_list_size = max_context_size - 4 ; + const int max_context_size = llama_n_ctx(ctx); + const int max_tokens_list_size = max_context_size - 4; - if ( (int)tokens_list.size() > max_tokens_list_size ) - { - fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" , - __func__ , (int)tokens_list.size() , max_tokens_list_size ); + if ((int) tokens_list.size() > max_tokens_list_size) { + fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size); return 1; } - fprintf( stderr, "\n\n" ); - - // Print the tokens from the prompt : + fprintf(stderr, "\n\n"); - for( auto id : tokens_list ) - { - printf( "%s" , llama_token_to_str( ctx , id ) ); + for (auto id : tokens_list) { + fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); } - fflush(stdout); + fflush(stderr); - - //--------------------------------- - // Main prediction loop : - //--------------------------------- + // main loop // The LLM keeps a contextual cache memory of previous token evaluation. 
// Usually, once this cache is full, it is required to recompute a compressed context based on previous // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist // example, we will just stop the loop once this cache is full or once an end of stream is detected. - while ( llama_get_kv_cache_token_count( ctx ) < max_context_size ) - { - //--------------------------------- - // Evaluate the tokens : - //--------------------------------- + const int n_gen = std::min(32, max_context_size); + + while (llama_get_kv_cache_token_count(ctx) < n_gen) { + // evaluate the transformer - if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) ) - { - fprintf( stderr, "%s : failed to eval\n" , __func__ ); + if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } tokens_list.clear(); - //--------------------------------- - // Select the best prediction : - //--------------------------------- + // sample the next token llama_token new_token_id = 0; - auto logits = llama_get_logits( ctx ); - auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens) + auto logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); std::vector candidates; - candidates.reserve( n_vocab ); + candidates.reserve(n_vocab); - for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ ) - { - candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } ); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - // Select it using the "Greedy sampling" method : - new_token_id = llama_sample_token_greedy( ctx , &candidates_p ); - + new_token_id = llama_sample_token_greedy(ctx , &candidates_p); // is it an end of stream ? 
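 // (note: llama_token_eos() takes the context below because, after the GGUF
 //  migration, special token ids such as EOS come from the loaded model's
 //  metadata rather than being hard-coded)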
- if ( new_token_id == llama_token_eos() ) - { + if (new_token_id == llama_token_eos(ctx)) { fprintf(stderr, " [end of text]\n"); break; } - // Print the new token : - printf( "%s" , llama_token_to_str( ctx , new_token_id ) ); - fflush( stdout ); - - // Push this new token for next evaluation : - tokens_list.push_back( new_token_id ); + // print the new token : + printf("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + fflush(stdout); - } // wend of main loop + // push this new token for next evaluation + tokens_list.push_back(new_token_id); + } - llama_free( ctx ); - llama_free_model( model ); + llama_free(ctx); + llama_free_model(model); llama_backend_free(); + fprintf(stderr, "\n\n"); + return 0; } - -// EOF diff --git a/examples/speculative/CMakeLists.txt b/examples/speculative/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c5c9456e6234428763f134d02d956e3693a028f --- /dev/null +++ b/examples/speculative/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET speculative) +add_executable(${TARGET} speculative.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp new file mode 100644 index 0000000000000000000000000000000000000000..aa904183fa2d8ebc77cd012972085d405e5ae09f --- /dev/null +++ b/examples/speculative/speculative.cpp @@ -0,0 +1,311 @@ +#include "build-info.h" + +#include "common.h" +#include "llama.h" +#include "grammar-parser.h" + +#include +#include +#include +#include + +int main(int argc, char ** argv) { + gpt_params params; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.model_draft.empty()) { + fprintf(stderr, "%s: error: --model-draft is required\n", __func__); + return 1; + } + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("speculative", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + + // init llama.cpp + llama_backend_init(params.numa); + + llama_model * model_tgt = NULL; + llama_model * model_dft = NULL; + + llama_context * ctx_tgt = NULL; + llama_context * ctx_dft = NULL; + + // load the target model + params.perplexity = true; // HACK: enable logits_all = true + std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); + + // load the draft model + params.model = params.model_draft; + params.n_gpu_layers = params.n_gpu_layers_draft; + std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + + // tokenize the prompt + std::vector inp; + inp = ::llama_tokenize(ctx_tgt, params.prompt, true); + + const int max_context_size = llama_n_ctx(ctx_tgt); + const int max_tokens_list_size = max_context_size - 4; + + if ((int) inp.size() > max_tokens_list_size) { + fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + return 1; + } + + fprintf(stderr, "\n\n"); + + for (auto id : inp) { + fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str()); + } + + fflush(stderr); + + const int n_input = inp.size(); + + const auto t_enc_start = ggml_time_us(); + + // eval the prompt with both models + llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads); + llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads); + llama_eval(ctx_dft, inp.data(), 
int(inp.size()), 0, params.n_threads); + + const auto t_enc_end = ggml_time_us(); + + // the 2 models should have the same vocab + const int n_ctx = llama_n_ctx(ctx_tgt); + const int n_vocab = llama_n_vocab(ctx_tgt); + //GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft)); + + // how many tokens to draft each time + int n_draft = params.n_draft; + + int n_predict = 0; + int n_drafted = 0; + int n_accept = 0; + + int n_past_tgt = inp.size(); + int n_past_dft = inp.size(); + + std::vector drafted; + + std::vector last_tokens(n_ctx); + std::fill(last_tokens.begin(), last_tokens.end(), 0); + + for (auto & id : inp) { + last_tokens.erase(last_tokens.begin()); + last_tokens.push_back(id); + } + + std::vector candidates; + candidates.reserve(n_vocab); + + // used to determine end of generation + bool has_eos = false; + + // grammar stuff + struct llama_grammar * grammar_dft = NULL; + struct llama_grammar * grammar_tgt = NULL; + + grammar_parser::parse_state parsed_grammar; + + // if requested - load the grammar, error checking is omitted for brevity + if (!params.grammar.empty()) { + parsed_grammar = grammar_parser::parse(params.grammar.c_str()); + // will be empty (default) if there are parse errors + if (parsed_grammar.rules.empty()) { + return 1; + } + + std::vector grammar_rules(parsed_grammar.c_rules()); + grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + } + + const auto t_dec_start = ggml_time_us(); + + while (true) { + LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted)); + + int i_dft = 0; + + while (true) { + // sample from the target model + const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft); + + // remember which tokens were sampled - used for repetition penalties during sampling + last_tokens.erase(last_tokens.begin()); + last_tokens.push_back(id); + + //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens)); + + const std::string token_str = llama_token_to_piece(ctx_tgt, id); + printf("%s", token_str.c_str()); + fflush(stdout); + + if (id == llama_token_eos(ctx_tgt)) { + has_eos = true; + } + + ++n_predict; + + // check if the draft matches the target + if (i_dft < (int) drafted.size() && id == drafted[i_dft]) { + LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); + ++n_accept; + ++n_past_tgt; + ++n_past_dft; + ++i_dft; + + continue; + } + + // the drafted token was rejected or we are out of drafted tokens + + if (i_dft < (int) drafted.size()) { + LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n", + i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str()); + } else { + LOG("out of drafted tokens\n"); + } + + llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads); + ++n_past_dft; + + // heuristic for n_draft + { + const int n_draft_cur = (int) drafted.size(); + const bool all_accepted = i_dft == n_draft_cur; + + LOG("n_draft = %d\n", n_draft); + LOG("n_draft_cur = %d\n", n_draft_cur); + LOG("i_dft = %d\n", i_dft); + LOG("all_accepted = %d\n", all_accepted); + + if (all_accepted && n_draft == n_draft_cur) { + LOG(" - max drafted tokens accepted - n_draft += 8\n"); + n_draft = std::min(30, n_draft + 8); + } else if (all_accepted) { + LOG(" - partially drafted tokens accepted - no change\n"); + } else { + LOG(" - drafted token rejected - n_draft -= 1\n"); + n_draft = std::max(2, 
n_draft - 1); + } + } + + drafted.clear(); + drafted.push_back(id); + + break; + } + + if (n_predict > params.n_predict || has_eos) { + break; + } + + if (grammar_tgt) { + if (grammar_dft) { + llama_grammar_free(grammar_dft); + } + grammar_dft = llama_grammar_copy(grammar_tgt); + + LOG("copied target grammar to draft grammar\n"); + } + + // sample n_draft tokens from the draft model using greedy decoding + int n_past_cur = n_past_dft; + for (int i = 0; i < n_draft; ++i) { + float * logits = llama_get_logits(ctx_dft); + + candidates.clear(); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array cur_p = { candidates.data(), candidates.size(), false }; + + if (grammar_dft != NULL) { + llama_sample_grammar(ctx_dft, &cur_p, grammar_dft); + } + + // computes softmax and sorts the candidates + llama_sample_softmax(ctx_dft, &cur_p); + + for (int i = 0; i < 3; ++i) { + LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str()); + } + + // TODO: better logic? + if (cur_p.data[0].p < 2*cur_p.data[1].p) { + LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p); + break; + } + + // drafted token + const llama_token id = cur_p.data[0].id; + + drafted.push_back(id); + ++n_drafted; + + // no need to evaluate the last drafted token, since we won't use the result + if (i == n_draft - 1) { + break; + } + + // evaluate the drafted token on the draft model + llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads); + ++n_past_cur; + + if (grammar_dft != NULL) { + llama_grammar_accept_token(ctx_dft, grammar_dft, id); + } + } + + // evaluate the target model on the drafted tokens + llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads); + ++n_past_tgt; + + // the first token is always proposed by the traget model before the speculation loop + drafted.erase(drafted.begin()); + } + + auto t_dec_end = ggml_time_us(); + + LOG_TEE("\n\n"); + + LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + + // TODO: make sure these numbers are computed correctly + LOG_TEE("\n"); + LOG_TEE("n_draft = %d\n", n_draft); + LOG_TEE("n_predict = %d\n", n_predict); + LOG_TEE("n_drafted = %d\n", n_drafted); + LOG_TEE("n_accept = %d\n", n_accept); + LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + + LOG_TEE("\ndraft:\n"); + llama_print_timings(ctx_dft); + + LOG_TEE("\ntarget:\n"); + llama_print_timings(ctx_tgt); + + llama_free(ctx_tgt); + llama_free_model(model_tgt); + + llama_free(ctx_dft); + llama_free_model(model_dft); + + if (grammar_dft != NULL) { + llama_grammar_free(grammar_dft); + llama_grammar_free(grammar_tgt); + } + llama_backend_free(); + + fprintf(stderr, "\n\n"); + + return 0; +} diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md index 726ec47c0ce4f90d1a99d7a8ba3ee85c096a179a..f4ffcd9876c0c7a08c7ac883dbf0498e94280b30 100644 --- a/examples/train-text-from-scratch/README.md +++ b/examples/train-text-from-scratch/README.md @@ -8,15 +8,15 @@ wget 
https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s # train ./bin/train-text-from-scratch \ - --vocab-model ../models/ggml-vocab.bin \ + --vocab-model ../models/ggml-vocab-llama.gguf \ --ctx 64 --embd 256 --head 8 --layer 16 \ - --checkpoint-in chk-shakespeare-256x16.bin \ - --checkpoint-out chk-shakespeare-256x16.bin \ - --model-out ggml-shakespeare-256x16-f32.bin \ + --checkpoint-in chk-shakespeare-256x16.gguf \ + --checkpoint-out chk-shakespeare-256x16.gguf \ + --model-out ggml-shakespeare-256x16-f32.gguf \ --train-data "shakespeare.txt" \ - -t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \ - --print-details-interval 0 --predict 16 --use-flash + -t 6 -b 16 --seed 1 --adam-iter 256 \ + --no-checkpointing # predict -./bin/main -m ggml-shakespeare-256x16-f32.bin +./bin/main -m ggml-shakespeare-256x16-f32.gguf ``` diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py new file mode 100644 index 0000000000000000000000000000000000000000..a527d615304b8abb88d07dc21b84c3b0381040eb --- /dev/null +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -0,0 +1,495 @@ +#!/usr/bin/env python3 +# train-text-from-scratch checkpoint --> gguf conversion + +import argparse +import os +import struct +import sys +import numpy as np +from pathlib import Path + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf')) +import gguf + +# gguf constants +LLM_KV_OPTIMIZER_TYPE = "optimizer.type" +LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" +LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs" +LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version" +LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count" +LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count" +LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count" +LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized" +LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss" +LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss" +LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count" +LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count" +LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end" +LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count" + +LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments" +LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments" +LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values" + +LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters" +LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters" +LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients" +LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients" +LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction" +LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = 
"optimizer.lbfgs.memory_alpha" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y" + +LLM_KV_TRAINING_FILE_VERSION = "training.file_version" +LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" +LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" +LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" + +class Tensor: + def __init__(self, dtype='f', ne=None): + if ne is None: + ne = [] + self.dtype = dtype + self.ne = ne + self.nbytes = 0 + if self.dtype == 'f': + if len(self.ne) == 0: + self.nbytes = 0 + else: + self.nbytes = int(np.product(self.ne)) * 4 + else: + raise ValueError(f"Unhandled data type '{self.dtype}'") + + def load(self, data, offset): + nd = struct.unpack(' 0 else []) + + self.lbfgs_x = Tensor('f', [self.nx]) + self.lbfgs_xp = Tensor('f', [self.nx]) + self.lbfgs_g = Tensor('f', [self.nx]) + self.lbfgs_gp = Tensor('f', [self.nx]) + self.lbfgs_d = Tensor('f', [self.nx]) + self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) + self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) + self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m]) + + if self.type == 0: + # these tensors are stored, but we don't need their data + x = Tensor('f', [self.nx]) + g = Tensor('f', [self.nx]) + g2 = Tensor('f', [self.nx]) + mh = Tensor('f', [self.nx]) + vh = Tensor('f', [self.nx]) + + offset = x.load(data, offset) + offset = g.load(data, offset) + offset = g2.load(data, offset) + offset = self.adam_m.load(data, offset) + offset = self.adam_v.load(data, offset) + offset = mh.load(data, offset) + offset = vh.load(data, offset) + offset = self.adam_pf.load(data, offset) + + self.adam_fx_best = struct.unpack(' 0 else []) + + self.lbfgs_x = Tensor('f', [self.nx]) + self.lbfgs_xp = Tensor('f', [self.nx]) + self.lbfgs_g = Tensor('f', [self.nx]) + self.lbfgs_gp = Tensor('f', [self.nx]) + self.lbfgs_d = Tensor('f', [self.nx]) + self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) + self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) + self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m]) + + # forgot to save type in version 1: + # guess self.type from number of remaining bytes + size_type_0 = 12 + sum([t.max_storage_size() for t in + [self.adam_m, self.adam_v] + +([self.adam_pf] if (self.past > 0) else [])]) + size_type_1 = 24 + sum([t.max_storage_size() for t in + [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, + self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, + self.lbfgs_lmal, self.lbfgs_lmys, + self.lbfgs_lms, self.lbfgs_lmy] + +([self.lbfgs_pf] if (self.past > 0) else [])]) + # due to alignment padding the size might not by exact + # but the difference in size for both types is significant, + # so we can just use whichever is closest + remaining = len(data) - offset + if abs(remaining - size_type_0) < abs(remaining - size_type_1): + self.type = 0 + else: + self.type = 1 + + if self.type == 0: + offset = self.adam_m.load(data, offset) + offset = self.adam_v.load(data, offset) + offset = self.adam_pf.load(data,offset) + + self.adam_fx_best = struct.unpack(' 0: + self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES) + + elif self.type == 1: + gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, 
LLM_KV_OPTIMIZER_TYPE_LBFGS) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m) + gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best) + gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement) + + self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS) + self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS) + self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS) + self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS) + self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION) + if self.past > 0: + self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES) + self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA) + self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS) + self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S) + self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y) + else: + raise ValueError('Unknown optimizer type') + +class ModelParams: + def __init__(self): + pass + + def load(self, data, offset): + self.n_vocab = struct.unpack(' #include @@ -16,8 +18,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; - struct random_normal_distribution { std::mt19937 gen; std::normal_distribution rd; @@ -62,17 +62,6 @@ float frand_uniform(struct random_uniform_distribution * rnd) { return rnd->rd(rnd->gen); } -void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier switch (tensor->n_dims) { @@ -166,31 +155,20 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc return tensor; } -struct llama_vocab { - using id = int32_t; - using token = std::string; - - struct token_score { - token tok; - float score; - }; - - std::unordered_map token_to_id; - std::vector id_to_token; -}; - struct my_llama_hparams { uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? 
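// [editor's note, not part of the patch] the explicit n_ff field added just
// below replaces the get_n_ff() helper that this patch deletes further down.
// Minimal sketch of the retired derivation, assuming LLaMA-7B's n_embd = 4096
// and the multiple_of value of 256 shipped with the 7B weights (the struct's
// old default of n_mult = 4 does not reproduce 11008):
static uint32_t get_n_ff_legacy(uint32_t n_embd, uint32_t n_mult) {
    // round 2/3 * (4 * n_embd) up to the next multiple of n_mult
    return ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
}
// get_n_ff_legacy(4096, 256) == 11008 — exactly the new hard-coded default;
// GGUF stores n_ff directly, so n_mult no longer needs to be carried around.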
+ uint32_t n_ctx = 512; uint32_t n_embd = 4096; - uint32_t n_mult = 4; uint32_t n_head = 32; uint32_t n_layer = 32; uint32_t n_rot = 64; + uint32_t n_ff = 11008; - bool operator!=(const my_llama_hparams& other) const { - return memcmp(this, &other, sizeof(my_llama_hparams)); - } + // float f_norm_eps = 1e-5; // falcon + float f_norm_rms_eps = 1e-5; // llama + + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; }; struct my_llama_layer { @@ -212,17 +190,6 @@ struct my_llama_layer { struct ggml_tensor * w3; }; -struct my_llama_kv_cache { - struct ggml_context * ctx = NULL; - - struct ggml_tensor * k; - struct ggml_tensor * v; - - // llama_ctx_buffer buf; - - int n; // number of tokens currently in the cache -}; - struct my_llama_model { struct ggml_context * ctx = NULL; @@ -240,18 +207,91 @@ struct my_llama_model { uint32_t train_tokens = 0; }; -uint32_t get_n_ff(const struct my_llama_hparams* hparams) { - const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; - return n_ff; -} +// gguf constants +const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; +const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; +const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; +const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; +const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; +const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; +const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; +const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; +const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; +const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; +const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; +const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; +const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; +const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; + +const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; +const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; +const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; + +const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = 
"optimizer.lbfgs.memory_ys"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; + +const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; +const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; +const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; +const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; + +// gguf constants (sync with gguf.py) + +const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; +const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; + +const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; +const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; +const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; +const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; +const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; +const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; +const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; +const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp +const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; + +const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model"; +const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens"; +const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"; +const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores"; +const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges"; +const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"; +const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"; +const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"; +const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"; +const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"; + +const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; +const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; +const char * LLM_TENSOR_OUTPUT = "output"; +const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; +const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; +const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; +const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; +const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; +const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; +const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; +const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; +const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_embd: %d\n", __func__, params->n_embd); - printf("%s: n_mult: %d\n", __func__, params->n_mult); printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_ff: %d\n", __func__, params->n_ff); printf("%s: n_layer: %d\n", __func__, params->n_layer); printf("%s: n_rot: %d\n", __func__, params->n_rot); } @@ -262,8 +302,7 @@ void init_model(struct my_llama_model * model) { const uint32_t n_embd = hparams.n_embd; const uint32_t n_layer = hparams.n_layer; const uint32_t n_vocab = hparams.n_vocab; - - const uint32_t n_ff = get_n_ff(&hparams); + const uint32_t n_ff = hparams.n_ff; struct ggml_context * ctx = model->ctx; @@ -271,20 +310,31 @@ void init_model(struct my_llama_model * model) { 
model->train_samples = 0; model->train_tokens = 0; + std::vector tn_buf; + tn_buf.resize(GGML_MAX_NAME); + auto tn = [&tn_buf](const char * key) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); + return tn_buf.data(); + }; + auto tni = [&tn_buf](const char * key, int bid) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), key, bid); + std::string s = tn_buf.data(); + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); + return tn_buf.data(); + }; + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); - ggml_set_name(model->norm, "norm.weight"); - ggml_set_name(model->output, "output.weight"); + ggml_set_name(model->tok_embeddings, tn(LLM_TENSOR_TOKEN_EMBD)); + ggml_set_name(model->norm, tn(LLM_TENSOR_OUTPUT_NORM)); + ggml_set_name(model->output, tn(LLM_TENSOR_OUTPUT)); model->layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - std::string layers_i = "layers." + std::to_string(i); - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); @@ -298,18 +348,18 @@ void init_model(struct my_llama_model * model) { layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); + ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i)); - ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); - ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); - ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); - ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str()); + ggml_set_name(layer.wq, tni(LLM_TENSOR_ATTN_Q, i)); + ggml_set_name(layer.wk, tni(LLM_TENSOR_ATTN_K, i)); + ggml_set_name(layer.wv, tni(LLM_TENSOR_ATTN_V, i)); + ggml_set_name(layer.wo, tni(LLM_TENSOR_ATTN_OUT, i)); - ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); + ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i)); - ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str()); - ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str()); - ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str()); + ggml_set_name(layer.w1, tni(LLM_TENSOR_FFN_GATE, i)); + ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i)); + ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i)); } } @@ -368,267 +418,6 @@ void randomize_model(struct my_llama_model * model, int seed, float mean, float } } -bool init_kv_cache(struct my_llama_kv_cache* cache, struct my_llama_model * model, int n_batch) { - const auto & hparams = model->hparams; - - const uint32_t n_ctx = hparams.n_ctx; - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - - const int64_t n_mem = n_layer*n_ctx*n_batch; - const int64_t n_elements = n_embd*n_mem; - - // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); - - // struct ggml_init_params params; - // params.mem_size = cache.buf.size; - // params.mem_buffer = cache.buf.addr; - // params.no_alloc = false; - if (!cache->ctx) { - struct ggml_init_params params; - params.mem_size = 
2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; - params.mem_buffer = NULL; - params.no_alloc = false; - - cache->ctx = ggml_init(params); - - if (!cache->ctx) { - fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); - return false; - } - } - - cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - - return true; -} - -struct ggml_tensor * forward( - struct my_llama_model * model, - struct my_llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_past) { - - const int N = n_tokens; - - struct my_llama_kv_cache& kv_self = *cache; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); - - struct ggml_tensor * kc = kv_self.k; - struct ggml_tensor * vc = kv_self.v; - - // inpL shape [n_embd,N,1,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, 1] - // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - // wv shape [n_embd, n_embd, 1, 1] - // Vcur shape [n_embd, N, 1, 1] - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N))); - - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // kv_self.v shape [n_embd * n_ctx * n_layer, 1] - // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] - // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] - - /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - - // important: storing RoPE-ed version of K in the KV cache! 
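// [editor's note, not part of the patch] both this disabled ggml_cpy path and
// the live ggml_set_*_inplace calls further below store K only after RoPE has
// been applied, so cached keys already carry their positional rotation and
// never need to be re-rotated when later tokens attend to them.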
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } //*/ - - kc = ggml_set_1d_inplace(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - vc = ggml_set_2d_inplace(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - } - - // Qcur shape [n_embd/n_head, n_head, N, 1] - // Q shape [n_embd/n_head, N, n_head, 1] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // K shape [n_embd/n_head, n_past + N, n_head, 1] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - - // K * Q - // KQ shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - - // split cached V into n_head heads - //// V shape [n_past + N, n_embd/n_head, n_head, 1] - // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] - struct ggml_tensor * V = - ggml_view_3d(ctx0, vc, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(vc), - n_ctx*ggml_element_size(vc)*n_embd/n_head, - il*n_ctx*ggml_element_size(vc)*n_embd); - - // KQV shape [n_embd/n_head, N, n_head, 1] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - // KQV_merged shape - - // cur = KQV_merged.contiguous().view(n_embd, N) - // cur shape [n_embd,N,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); - // cur = ggml_cpy(ctx0, - // KQV_merged, - // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection (no bias) - // cur shape [n_embd,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N,1,1] - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - - // cur = ffn_norm*cur - // cur shape [n_embd,N,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - } - - // tmp shape [n_ff,N,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - - // cur shape [n_ff,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - - // SILU activation - // cur shape [n_ff,N,1,1] - cur = ggml_silu(ctx0, cur); - - // cur shape [n_ff,N,1,1] - cur = ggml_mul(ctx0, cur, tmp); - - // cur shape [n_embd,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - } - - // cur shape [n_embd,N,1,1] - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - // 
inpL shape [n_embd,N,1,1] - inpL = cur; - } - - // norm - { - - // inpL shape [n_embd,N,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - - // inpL = norm*inpL - // inpL shape [n_embd,N,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; -} - void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { GGML_ASSERT(tensor->n_dims == 1); GGML_ASSERT(tensor->ne[0] == ne0); @@ -655,786 +444,222 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 GGML_ASSERT(tensor->ne[3] == ne3); } -struct ggml_tensor * forward_batch( - struct my_llama_model * model, - struct my_llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_past, - const int n_batch) { - - const int N = n_tokens; - - struct my_llama_kv_cache& kv_self = *cache; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); - memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); - - struct ggml_tensor * kc = kv_self.k; - struct ggml_tensor * vc = kv_self.v; - - // inpL shape [n_embd,N*n_batch,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - // wv shape [n_embd, n_embd, 1, 1] - // Vcur shape [N, n_embd, n_batch, 1] - struct ggml_tensor * Vcur = ggml_cont(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_mul_mat(ctx0, - model->layers[il].wv, - cur), - n_embd, N, n_batch), - 1, 0, 2, 3)); - assert_shape_3d(Vcur, N, n_embd, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] - // k shape [n_embd * N, n_batch] == 
kv_self.k[:,n_past:n_past+N,:,il] - // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il] - - /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } //*/ - - kc = ggml_set_2d_inplace(ctx0, kc, - ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch), - ggml_element_size(kc)*n_embd*n_ctx, - (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past)); - vc = ggml_set_2d_inplace(ctx0, vc, - ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch), - ggml_element_size(vc)*n_ctx*n_embd, - ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx)); - - assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer); - assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer); - } - - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Q shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // K shape [n_embd/n_head, n_past + N, n_head, n_batch] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_4d(ctx0, - ggml_view_3d(ctx0, - kc, - n_embd, - (n_past + N), - n_batch, - n_embd*ggml_element_size(kc), - n_ctx*n_embd*ggml_element_size(kc), - il*n_batch*n_ctx*n_embd*ggml_element_size(kc)), - n_embd/n_head, n_head, n_past + N, n_batch), - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch); - - // K * Q - // KQ shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - assert_shape_4d(KQ, n_past + N, N, n_head, n_batch); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); - - // split cached V into n_head heads - // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] - // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il] - struct ggml_tensor * V = - ggml_view_4d(ctx0, vc, - n_past + N, n_embd/n_head, n_head, n_batch, - ggml_element_size(vc)*n_ctx, - ggml_element_size(vc)*n_ctx*n_embd/n_head, - ggml_element_size(vc)*n_ctx*n_embd, - il*n_batch*n_ctx*n_embd*ggml_element_size(vc)); - assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch); - - // KQV shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape 
[n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - // KQV_merged shape - - // cur = KQV_merged.contiguous().view(n_embd, N) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - // cur = ggml_cpy(ctx0, - // KQV_merged, - // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection (no bias) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N*n_batch,1,1] - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} - // feed-forward network - { - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } +static size_t hash_find(void * hash_table[], void * p) { + size_t h = hash(p); - // tmp shape [n_ff,N*n_batch,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); + // linear probing + size_t i = h; + while (hash_table[i] != NULL && hash_table[i] != p) { + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // visited all hash table entries -> not found + return GGML_GRAPH_HASHTABLE_SIZE; } - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_add_inplace(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - // inpL shape [n_embd,N*n_batch,1,1] - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); } + return i; +} - // norm - { - - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); +static bool hash_insert(void * hash_table[], void * p) { + //size_t h = hash(p); + size_t i = hash_find(hash_table, p); - assert_shape_2d(inpL, n_embd, N*n_batch); + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - //embeddings = inpL; + if (hash_table[i] == p) { + return true; } - // lm_head - // inpL shape [n_vocab,N*n_batch,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - // inpL shape [n_vocab,N,n_batch,1] - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - ggml_build_forward_expand(gf, inpL); - - 
return inpL; + // insert + GGML_ASSERT(hash_table[i] == NULL); + hash_table[i] = p; + return false; } -struct ggml_tensor * forward_batch_wo_cache( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_batch) { - - const int n_past = 0; - const int N = n_tokens; - - const auto & hparams = model->hparams; - //const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); - memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); - - // inpL shape [n_embd,N*n_batch,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - // Vcur shape [N, n_batch, n_embd/n_head, n_head] - struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); - assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); - - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Q shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // K shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * K = - ggml_permute(ctx0, - Kcur, - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); - - // K * Q - // KQ shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - assert_shape_4d(KQ, N, N, n_head, n_batch); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - assert_shape_4d(KQ_scaled, N, N, n_head, n_batch); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - assert_shape_4d(KQ_masked, N, N, n_head, n_batch); - - // 
KQ = soft_max(KQ_masked) - // KQ_soft_max shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch); - - // Vcur shape [N, n_batch, n_embd/n_head, n_head] - // V shape [N, n_embd/n_head, n_head, n_batch] - struct ggml_tensor * V = - ggml_permute(ctx0, - Vcur, - 0, 3, 1, 2); - assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); - - // KQV shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - // KQV_merged shape - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - - // projection (no bias) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N*n_batch,1,1] - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // tmp shape [n_ff,N*n_batch,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } +static bool hash_contains(void * hash_table[], void * p) { + size_t i = hash_find(hash_table, p); + return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); +} - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_add_inplace(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); +struct hash_map { + void * keys[GGML_GRAPH_HASHTABLE_SIZE]; + void * vals[GGML_GRAPH_HASHTABLE_SIZE]; +}; +//static const size_t HASH_MAP_SIZE = sizeof(struct hash_map); - // input for next layer - // inpL shape [n_embd,N*n_batch,1,1] - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); +struct hash_map * new_hash_map() { + struct hash_map * result = new struct hash_map; + for (int i=0; ikeys[i] = NULL; + result->vals[i] = NULL; } + return result; +}; - // norm - { - - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - - //embeddings 
= inpL; - } +void free_hash_map(struct hash_map * map) { + delete map; +} - // lm_head - // inpL shape [n_vocab,N*n_batch,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); +static bool ggml_is_view(struct ggml_tensor * t) { + return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || + t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; +} - { - // inpL shape [n_vocab,N,n_batch,1] - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); +static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { + switch (t->op) { + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + return t->src[0]; + case GGML_OP_CPY: + return t->src[1]; + default: + return NULL; } - - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; } -struct ggml_tensor * forward_batch_wo_cache_flash_attn( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_batch) { - - const int n_past = 0; - const int N = n_tokens; - - const auto & hparams = model->hparams; - //const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); - memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); - - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // norm - { - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); - assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); - - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * K = - ggml_permute(ctx0, - Kcur, - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * V = - ggml_permute(ctx0, - Vcur, - 0, 3, 1, 2); - assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); - - bool masked = true; - struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, masked); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); 
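// [editor's note, not part of the patch] ggml_flash_attn(ctx, Q, K, V, masked)
// is the fused equivalent of the KQ -> scale(1/sqrt(n_embd/n_head)) ->
// diag_mask_inf -> soft_max -> V chain spelled out in the non-flash variants
// earlier in this file; masked = true selects the causal mask.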
- - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } +static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { + struct ggml_tensor * parent = t; + do { + parent = get_view_parent(parent); + } while (ggml_is_view(parent)); + return parent; +} - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); +struct ggml_tensor * ggml_recompute_graph_node( + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct hash_map * replacements, + struct ggml_tensor * node) { - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); + if (node == NULL) { + return NULL; + } - // SILU activation - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); + if (node->is_param) { + return node; + } - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); + if (!hash_contains(graph->visited_hash_table, node)) { + return node; + } - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); + int count_children = 0; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + if (node->src[k]) { + ++count_children; } - - cur = ggml_add_inplace(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); } - // norm - { - - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); + if (count_children == 0) { + return node; } - // lm_head - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); + size_t i = hash_find(replacements->keys, node); + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + if (replacements->keys[i] == node) { + return (struct ggml_tensor *) replacements->vals[i]; } - // run the computation - ggml_build_forward_expand(gf, inpL); + struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); - return inpL; -} + // insert clone into replacements + GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite + replacements->keys[i] = node; + replacements->vals[i] = clone; -// expand the graph nodes without creating leafs. 
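// [editor's note, not part of the patch] sketch of how the checkpointing entry
// point defined a little further down is meant to be driven; the wrapper name,
// graph allocation and checkpoint selection here are placeholders, not code
// from this patch. The idea: activations that are not checkpoints are rebuilt
// from the nearest earlier checkpoint during the backward pass instead of
// being kept alive from the forward pass, trading extra compute for memory.
static void build_backward_with_checkpoints(
        struct ggml_context * ctx,
        struct ggml_cgraph  * gf,          // finished forward graph
        struct ggml_cgraph  * gb,          // receives the rewritten backward graph
        struct ggml_cgraph  * gb_tmp,      // scratch backward graph
        struct ggml_tensor ** checkpoints, // e.g. one activation per layer
        int                   n_checkpoints) {
    // with n_checkpoints <= 0 this reduces to the plain result of
    // ggml_build_backward_expand(ctx, gf, gb_tmp, true): every intermediate
    // activation stays resident for the whole backward pass
    ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp,
                                               checkpoints, n_checkpoints);
}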
-struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { - // check if already visited - for (int i = 0; i < g->n_nodes; i++) { - if (g->nodes[i] == t) { - return t; - } + clone->op = node->op; + clone->grad = node->grad; + clone->is_param = node->is_param; + clone->extra = node->extra; + for (int k = 0; k < GGML_MAX_DIMS; ++k) { + clone->nb[k] = node->nb[k]; } - - for (int i = 0; i < g->n_leafs; i++) { - if (g->leafs[i] == t) { - return t; - } + for (int k = 0; k < GGML_MAX_SRC; ++k) { + clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); } - - for (int i = 0; i < GGML_MAX_SRC; ++i) { - if (t->src[i]) { - expand(g, t->src[i]); - } + if (ggml_is_view(clone)) { + struct ggml_tensor * source = get_view_source(clone); + GGML_ASSERT(source != NULL); + clone->data = source->data; } - GGML_ASSERT(g->n_nodes < GGML_MAX_NODES); + GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); + GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); + memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); + ggml_format_name(clone, "%s (clone)", ggml_get_name(node)); - if (strlen(t->name) == 0) { - snprintf(t->name, sizeof(t->name), "node_%d", g->n_nodes); - } - - g->nodes[g->n_nodes] = t; - g->grads[g->n_nodes] = t->grad; - g->n_nodes++; - return t; -} + return clone; +}; -void graph_set_leafs_grads(struct ggml_cgraph * g) { - // moves leaf nodes to g->leafs. - // i.e. g->n_nodes might change. - int n_nodes = 0; - for (int i = 0; i < g->n_nodes; ++i) { - struct ggml_tensor * node = g->nodes[i]; - const bool is_leaf = node->op == GGML_OP_NONE && node->grad == NULL; - if (is_leaf) { - GGML_ASSERT(g->n_leafs < GGML_MAX_NODES); - - if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "leaf_%d", g->n_leafs); - } +void ggml_build_backward_gradient_checkpointing( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * checkpoints, + int n_checkpoints) { + *gb_tmp = *gf; + ggml_build_backward_expand(ctx, gf, gb_tmp, true); + + if (n_checkpoints <= 0) { + *gb = *gb_tmp; + return; + } - g->leafs[g->n_leafs] = node; - g->n_leafs++; - } else { - GGML_ASSERT(n_nodes < GGML_MAX_NODES); + struct hash_map * replacements = new_hash_map(); - if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "node_%d", n_nodes); - } + // insert checkpoints in replacements + for (int i = 0; i < n_checkpoints; ++i) { + size_t k = hash_find(replacements->keys, checkpoints[i]); + GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite + replacements->keys[k] = checkpoints[i]; + replacements->vals[k] = checkpoints[i]; + } - g->nodes[n_nodes] = node; - g->grads[n_nodes] = node->grad; - n_nodes++; + *gb = *gf; + // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], + // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), + // by recomputing them from checkpoints + for (int i = gf->n_nodes; in_nodes; ++i) { + struct ggml_tensor * node = gb_tmp->nodes[i]; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + // insert new tensors recomputing src, reusing already made replacements, + // remember replacements: remember new tensors with mapping from corresponding gf nodes + // recurse for input tensors, + // unless (i.e. 
-struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
+struct ggml_tensor * llama_build_train_graphs(
         struct my_llama_model * model,
-        struct ggml_context * ctx0,
+        struct ggml_allocr * alloc,
+        struct ggml_context * ctx,
         struct ggml_cgraph * gf,
         struct ggml_cgraph * gb,
+        struct ggml_cgraph * gb_tmp,
         struct ggml_tensor * * logits,
         struct ggml_tensor * tokens_input,
         struct ggml_tensor * targets,
-        void * compute_buf_0,
-        void * compute_buf_1,
-        size_t size_buf_0,
-        size_t size_buf_1,
         const int n_tokens,
-        const int n_batch) {
-
-    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
+        const int n_batch,
+        const bool enable_flash_attn,
+        const bool enable_checkpointing) {
+    ggml_set_scratch(ctx, { 0, 0, nullptr, });
     const int n_past = 0;
     const int N = n_tokens;
-
-    gf->n_nodes = 0;
-    gf->n_leafs = 0;
-    gf->perf_runs = 0;
-    gf->perf_cycles = 0;
-    gf->perf_time_us = 0;
-
     const auto & hparams = model->hparams;
     const int n_ctx = hparams.n_ctx;
     const int n_vocab = hparams.n_vocab;
@@ -1442,476 +667,162 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
     const int n_layer = hparams.n_layer;
     const int n_head = hparams.n_head;
     const int n_rot = hparams.n_rot;
-    const int n_ff = get_n_ff(&hparams);
-    const int rope_mode = 0;
-
-    int last_buf = -1;
-    size_t buf_offs[2] = { 0, 0 };
-    size_t buf_size[2] = { size_buf_0,
-                           size_buf_1 };
-    void * buf_data[2] = { compute_buf_0,
-                           compute_buf_1 };
-    auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data] (int buf) {
-        size_t last_offs = 0;
-        last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-        if (last_buf >= 0) {
-            buf_offs[last_buf] = last_offs;
+    const int n_ff = hparams.n_ff;
+    const float f_norm_rms_eps  = hparams.f_norm_rms_eps;
+    const float rope_freq_base  = hparams.rope_freq_base;
+    const float rope_freq_scale = hparams.rope_freq_scale;
+
+    auto set_name = [](struct ggml_tensor * t, const char * n) {
+        ggml_set_name(t, n);
+        if (t->grad) {
+            ggml_format_name(t->grad, "%s->grad", n);
        }
-        if (buf >= 0) {
-            size_t offs = buf_offs[buf];
-            size_t size = buf_size[buf];
-            void * data = buf_data[buf];
-            ggml_set_scratch(ctx0, { offs, size, data, });
-        }
-        last_buf = buf;
    };
-    bool track_max_mem = false;
-    size_t buf_maxs[2] = { 0, 0 };
-
-    auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) {
-        if (buf < 0) return;
-        if (track_max_mem) {
-            size_t last_offs = 0;
-            last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-            if (last_buf >= 0) {
-                buf_offs[last_buf] = last_offs;
-                buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]);
-            }
-        }
-        buf_offs[buf] = 0;
-        if (track_max_mem && last_buf >= 0) {
-            size_t offs = buf_offs[last_buf];
-            size_t size = buf_size[last_buf];
-            void * data = buf_data[last_buf];
-            ggml_set_scratch(ctx0, { offs, size, data, });
-        }
-    };
-
-
-    auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
-        int64_t ne0 = n_embd/n_head;
-        int64_t ne1 = N;
-        int64_t ne2 = n_head;
-        int64_t ne3 = n_batch;
-        size_t nb0 = ggml_element_size(t);
-        size_t nb1 = nb0*ne0;
-        size_t nb2 = nb1*ne1;
-        size_t nb3 = nb2*ne2;
-        size_t offset = 0;
-        return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset);
-    };
+    // rope has so many parameters that we make a custom function for it
+    auto rope = [ctx, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
+                (struct ggml_tensor * t) -> struct ggml_tensor * {
+        // not capturing these, to silence warnings
+        const int n_past = 0;
+        const int rope_mode = 0;
-    auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
-        int64_t ne0 = n_embd/n_head;
-        int64_t ne1 = N;
-        int64_t ne2 = n_head;
-        int64_t ne3 = n_batch;
-        size_t nb0 = ggml_element_size(t);
-        size_t nb1 = nb0*ne0;
-        size_t nb2 = nb1*ne1;
-        size_t nb3 = nb2*ne2;
-        size_t offset = nb3*ne3;
-        return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset);
+        return ggml_rope_custom(ctx,
+            t, n_past, n_rot, rope_mode, n_ctx,
+            rope_freq_base, rope_freq_scale);
    };
-    auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
-        int64_t ne0 = N;
-        int64_t ne1 = n_embd/n_head;
-        int64_t ne2 = n_head;
-        int64_t ne3 = n_batch;
-        size_t nb0 = ggml_element_size(t);
-        size_t nb1 = nb0*ne0;
-        size_t nb2 = nb1*ne1;
-        size_t nb3 = nb2*ne2;
-        size_t offset = 2*nb3*ne3;
-        return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset);
-    };
+    set_name(tokens_input, "tokens_input");
+    set_name(targets,      "targets");
-    auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * {
-        if (a == NULL) {
-            return b;
-        } else {
-            return ggml_add_inplace(ctx0, a, b);
-        }
-    };
+    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
+    struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch);
+    struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch);
-    use_buf(-1);
+    struct ggml_tensor * cur = t01;
-    model->tok_embeddings->grad = NULL;
-    model->norm->grad = NULL;
-    model->output->grad = NULL;
+    std::vector<struct ggml_tensor *> checkpoints;
+    checkpoints.push_back(tokens_input);
+    checkpoints.push_back(targets);
+    checkpoints.push_back(t00);
+    checkpoints.push_back(t01);
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        layer.attention_norm->grad = NULL;
-        layer.wq->grad = NULL;
-        layer.wk->grad = NULL;
-        layer.wv->grad = NULL;
-        layer.wo->grad = NULL;
-        layer.ffn_norm->grad = NULL;
-        layer.w1->grad = NULL;
-        layer.w2->grad = NULL;
-        layer.w3->grad = NULL;
+    struct ggml_tensor * kv_scale;
+    if (!enable_flash_attn) {
+        kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head));
    }
-    clr_buf(0);
-    clr_buf(1);
-
-    use_buf(-1);
-
-    struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch);
-    memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch);
-
-    use_buf(-1);
-
-    struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch);
-
-    // need to remember these for the backward pass
-    std::vector<struct ggml_tensor *> t02L; t02L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t03L; t03L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t04L; t04L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t05L; t05L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t06L; t06L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t07L; t07L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t08L; t08L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t09L; t09L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t10L; t10L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t11L; t11L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t12L; t12L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t13L; t13L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t14L; t14L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t15L; t15L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t16L; t16L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t17L; t17L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t18L; t18L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t19L; t19L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t20L; t20L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t21L; t21L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t22L; t22L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t23L; t23L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t24L; t24L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t25L; t25L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t26L; t26L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t27L; t27L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t28L; t28L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t29L; t29L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t30L; t30L.resize(n_layer, NULL);
-
-    struct ggml_tensor * cur = t01;
-    for (int il = 0; il < n_layer; ++il) {
-        clr_buf(0);
        struct my_llama_layer & layer = model->layers[il];
-        // tensors with values necessary for backward pass are in persistent buf(-1)
-        // other tensors with buf(0) and buf(1) are only temporarily needed, and their memory is reused after the layer is completed.
-        use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm   (ctx0, cur, rms_norm_eps));                        assert_shape_2d(t02, n_embd, N*n_batch);
-        use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat     (ctx0, layer.attention_norm, t02));                assert_shape_2d(t03, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul        (ctx0, t02, t03));                                 assert_shape_2d(t04, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat    (ctx0, layer.wq, t04));                            assert_shape_2d(t05, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch));   assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, 0));       assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat    (ctx0, layer.wk, t04));                            assert_shape_2d(t08, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch));   assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, 0));       assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat    (ctx0, t04, layer.wv));                            assert_shape_2d(t11, N*n_batch, n_embd);
-        use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head));   assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
-        use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute    (ctx0, t07, 0, 2, 1, 3));                          assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
-        use_buf(-1); struct ggml_tensor * t14 = expand(gf, ggml_permute    (ctx0, t10, 0, 2, 1, 3));                          assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch);
-        use_buf(-1); struct ggml_tensor * t15 = expand(gf, ggml_permute    (ctx0, t12, 0, 3, 1, 2));                          assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
-        use_buf(-1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true));                      assert_shape_4d(t16,
n_embd/n_head, N, n_head, n_batch); - use_buf( 0); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); - use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf( 0); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); - t02L[il] = t02; - t03L[il] = t03; - t04L[il] = t04; - t05L[il] = t05; - t06L[il] = t06; - t07L[il] = t07; - t08L[il] = t08; - t09L[il] = t09; - t10L[il] = t10; - t11L[il] = t11; - t12L[il] = t12; - t13L[il] = t13; - t14L[il] = t14; - t15L[il] = t15; - t16L[il] = t16; - t17L[il] = t17; - t18L[il] = t18; - t19L[il] = t19; - t20L[il] = t20; - t21L[il] = t21; - t22L[il] = t22; - t23L[il] = t23; - t24L[il] = t24; - t25L[il] = t25; - t26L[il] = t26; - t27L[il] = t27; - t28L[il] = t28; - t29L[il] = t29; - t30L[il] = t30; - - cur = t30; - } - clr_buf(0); - use_buf(0); - struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); - use_buf(-1); - struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); - - { - /* - tok_embeddings | grad_tok_embeddings = ggml_get_rows_back(grad_t01, t00) - L0_att_norm | grad_L0_att_norm = ggml_repeat_back(grad_t03L0, L0_att_norm.shape) - L0_wq | grad_L0_wq = ggml_out_prod(t04L0, grad_t05L0) - L0_wk | grad_L0_wk = ggml_out_prod(t04L0, grad_t08L0) - L0_wv | grad_L0_wv = ggml_out_prod(t04L0, ggml_transpose(grad_t11L0)) - L0_wo | grad_L0_wo = 
ggml_out_prod(t19L0, grad_t20L0) - L0_ffn_norm | grad_L0_ffn_norm = ggml_repeat_back(grad_t23L0, L0_ffn_norm.shape) - L0_w1 | grad_L0_w1 = ggml_out_prod(t24L0, grad_t26L0) - L0_w2 | grad_L0_w2 = ggml_out_prod(t28L0, grad_t29L0) - L0_w3 | grad_L0_w3 = ggml_out_prod(t24L0, grad_t25L0) - L1_att_norm | grad_L1_att_norm = ggml_repeat_back(grad_t03L1, L1_att_norm.shape) - L1_wq | grad_L1_wq = ggml_out_prod(t04L1, grad_t05L1) - L1_wk | grad_L1_wk = ggml_out_prod(t04L1, grad_t08L1) - L1_wv | grad_L1_wv = ggml_out_prod(t04L1, ggml_transpose(grad_t11L1)) - L1_wo | grad_L1_wo = ggml_out_prod(t19L1, grad_t20L1) - L1_ffn_norm | grad_L1_ffn_norm = ggml_repeat_back(grad_t23L1, L1_ffn_norm.shape) - L1_w1 | grad_L1_w1 = ggml_out_prod(t24L1, grad_t26L1) - L1_w2 | grad_L1_w2 = ggml_out_prod(t28L1, grad_t29L1) - L1_w3 | grad_L1_w3 = ggml_out_prod(t24L1, grad_t25L1) - norm | grad_norm = ggml_repeat_back(grad_t32, norm.shape) - output | grad_output = ggml_out_prod(t33, grad_t34) - | - t01 = ggml_get_rows(tok_embeddings, t00) | grad_t01 = grad_t21L0 + ggml_rms_norm_back(t01, grad_t02L0) - for layer: | - t02L0*= ggml_rms_norm (t01) | grad_t02L0 = ggml_mul(grad_t04L0, t03L0) - t03L0 = ggml_repeat (L0_att_norm, t02L0_shape) | grad_t03L0 = ggml_mul(grad_t04L0, t02L0) - t04L0*= ggml_mul (t02L0, t03L0) | grad_t04L0 = ggml_out_prod(L0_wv, grad_t11L0) + ggml_out_prod(L0_wk, ggml_transpose(grad_t08L0)) + ggml_out_prod(L0_wq, ggml_transpose(grad_t05L0)) - t05L0 = ggml_mul_mat (L0_wq, t04L0) | grad_t05L0 = ggml_reshape(grad_t06L0, t05L0_shape) - t06L0 = ggml_reshape_4d (t05L0, n_embd/n_head, n_head, N, n_batch) | grad_t06L0 = ggml_rope_back(grad_t07L0) - t07L0 = ggml_rope_inplace (t06L0) | grad_t07L0 = ggml_permute_back(grad_t13L0, 0, 2, 1, 3) = ggml_permute(grad_t13L0, 0, 2, 1, 3) - t08L0 = ggml_mul_mat (L0_wk, t04L0) | grad_t08L0 = ggml_reshape(grad_t09L0, t08L0_shape) - t09L0 = ggml_reshape_4d (t08L0, n_embd/n_head, n_head, N, n_batch) | grad_t09L0 = ggml_rope_back(grad_t10L0) - t10L0 = ggml_rope_inplace (t09L0) | grad_t10L0 = ggml_permute_back(grad_t14L0, 0, 2, 1, 3) = ggml_permute(grad_t14L0, 0, 2, 1, 3) - t11L0 = ggml_mul_mat (t04L0, L0_wv) | grad_t11L0 = ggml_reshape(grad_t12L0, t11L0_shape) - t12L0 = ggml_reshape_4d (t11L0, N, n_batch, n_embd/n_head, n_head) | grad_t12L0 = ggml_permute_back(grad_t15L0, 0, 3, 1, 2) = ggml_permute(grad_t15L0, 0, 2, 3, 1) - t13L0*= ggml_permute (t07L0, 0, 2, 1, 3) | grad_t13L0 = view__q(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t14L0*= ggml_permute (t10L0, 0, 2, 1, 3) | grad_t14L0 = view__k(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t15L0*= ggml_permute (t12L0, 0, 3, 1, 2) | grad_t15L0 = view__v(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t16L0 = ggml_flash_attn (t13L0, t14L0, t15L0) | grad_t16L0 = ggml_permute_back(grad_t17L0, 0, 2, 1, 3) = ggml_permute(grad_t17L0, 0, 2, 1, 3) - t17L0 = ggml_permute (t16L0, 0, 2, 1, 3) | grad_t17L0 = grad_t18L0 - t18L0 = ggml_cont (t17L0) | grad_t18L0 = ggml_reshape(grad_t19L0, t18L0_shape) - t19L0*= ggml_reshape_2d (t18L0, n_embd, N*n_batch) | grad_t19L0 = ggml_out_prod(L0_wo, ggml_transpose(grad_t20L0)) - t20L0 = ggml_mul_mat (L0_wo, t19L0) | grad_t20L0 = grad_t21L0 - t21L0*= ggml_add (t20L0, t01) | grad_t21L0 = grad_t30L0 + ggml_rms_norm_back(t21L0, grad_t22L0) - t22L0*= ggml_rms_norm (t21L0) | grad_t22L0 = ggml_mul(grad_t24L0, t23L0) - t23L0 = ggml_repeat (L0_ffn_norm, t22L0_shape) | grad_t23L0 = ggml_mul(grad_t24L0, t22L0) - t24L0*= ggml_mul (t23L0, t22L0) | grad_t24L0 = ggml_out_prod(L0_w1, 
ggml_transpose(grad_t26L0)) + ggml_out_prod(L0_w3, ggml_transpose(grad_t25L0)) - t25L0*= ggml_mul_mat (L0_w3, t24L0) | grad_t25L0 = ggml_mul(grad_t28L0, t27L0) - t26L0*= ggml_mul_mat (L0_w1, t24L0) | grad_t26L0 = ggml_silu_back(t26L0, grad_t27L0) - t27L0*= ggml_silu (t26L0) | grad_t27L0 = ggml_mul(grad_t28L0, t25L0) - t28L0*= ggml_mul (t27L0, t25L0) | grad_t28L0 = ggml_out_prod(L0_w2, ggml_transpose(grad_t29L0)) - t29L0 = ggml_mul_mat (L0_w2, t28L0) | grad_t29L0 = grad_t30L0 - t30L0*= ggml_add (t21L0, t29L0) | grad_t30L0 = ggml_rms_norm_back(t30L0, grad_t02L1) + grad_t21L1 - ^ - t02L1*= ggml_rms_norm (t30L0) | grad_t02L1 = ggml_mul(grad_t04L1, t03L1) - t03L1 = ggml_repeat (L1_att_norm, t02L1_shape) | grad_t03L1 = ggml_mul(grad_t04L1, t02L1) - t04L1*= ggml_mul (t02L1, t03L1) | grad_t04L1 = ggml_out_prod(L1_wv, grad_t11L1) + ggml_out_prod(L1_wk, ggml_transpose(grad_t08L1)) + ggml_out_prod(L1_wq, ggml_transpose(grad_t05L1)) - t05L1 = ggml_mul_mat (L1_wq, t04L1) | grad_t05L1 = ggml_reshape(grad_t06L1, t05L1_shape) - t06L1 = ggml_reshape_4d (t05L1, n_embd/n_head, n_head, N, n_batch) | grad_t06L1 = ggml_rope_back(grad_t07L1) - t07L1 = ggml_rope_inplace (t06L1) | grad_t07L1 = ggml_permute_back(grad_t13L1, 0, 2, 1, 3) = ggml_permute(grad_t13L1, 0, 2, 1, 3) - t08L1 = ggml_mul_mat (L1_wk, t04L1) | grad_t08L1 = ggml_reshape(grad_t09L1, t08L1_shape) - t09L1 = ggml_reshape_4d (t08L1, n_embd/n_head, n_head, N, n_batch) | grad_t09L1 = ggml_rope_back(grad_t10L1) - t10L1 = ggml_rope_inplace (t09L1) | grad_t10L1 = ggml_permute_back(grad_t14L1, 0, 2, 1, 3) = ggml_permute(grad_t14L1, 0, 2, 1, 3) - t11L1 = ggml_mul_mat (t04L1, L1_wv) | grad_t11L1 = ggml_reshape(grad_t12L1, t11L1_shape) - t12L1 = ggml_reshape_4d (t11L1, N, n_batch, n_embd/n_head, n_head) | grad_t12L1 = ggml_permute_back(grad_t15L1, 0, 3, 1, 2) = ggml_permute(grad_t15L1, 0, 2, 3, 1) - t13L1*= ggml_permute (t07L1, 0, 2, 1, 3) | grad_t13L1 = view__q(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t14L1*= ggml_permute (t10L1, 0, 2, 1, 3) | grad_t14L1 = view__k(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t15L1*= ggml_permute (t12L1, 0, 3, 1, 2) | grad_t15L1 = view__v(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t16L1 = ggml_flash_attn (t13L1, t14L1, t15L1) | grad_t16L1 = ggml_permute_back(grad_t17L1, 0, 2, 1, 3) = ggml_permute(grad_t17L1, 0, 2, 1, 3) - t17L1 = ggml_permute (t16L1, 0, 2, 1, 3) | grad_t17L1 = grad_t18L1 - t18L1 = ggml_cont (t17L1) | grad_t18L1 = ggml_reshape(grad_t19L1, t18L1_shape) - t19L1*= ggml_reshape_2d (t18L1, n_embd, N*n_batch) | grad_t19L1 = ggml_out_prod(L1_wo, ggml_transpose(grad_t20L1)) - t20L1 = ggml_mul_mat (L1_wo, t19L1) | grad_t20L1 = grad_t21L1 - t21L1*= ggml_add (t20L1, t30L0) | grad_t21L1 = grad_t30L1 + ggml_rms_norm_back(t21L1, grad_t22L1) - t22L1*= ggml_rms_norm (t21L1) | grad_t22L1 = ggml_mul(grad_t24L1, t23L1) - t23L1 = ggml_repeat (L1_ffn_norm, t22L1_shape) | grad_t23L1 = ggml_mul(grad_t24L1, t22L1) - t24L1*= ggml_mul (t23L1, t22L1) | grad_t24L1 = ggml_out_prod(L1_w1, ggml_transpose(grad_t26L1)) + ggml_out_prod(L1_w3, ggml_transpose(grad_t25L1)) - t25L1*= ggml_mul_mat (L1_w3, t24L1) | grad_t25L1 = ggml_mul(grad_t28L1, t27L1) - t26L1*= ggml_mul_mat (L1_w1, t24L1) | grad_t26L1 = ggml_silu_back(t26L1, grad_t27L1) - t27L1*= ggml_silu (t26L1) | grad_t27L1 = ggml_mul(grad_t28L1, t25L1) - t28L1*= ggml_mul (t27L1, t25L1) | grad_t28L1 = ggml_out_prod(L1_w2, ggml_transpose(grad_t29L1)) - t29L1 = ggml_mul_mat (L1_w2, t28L1) | grad_t29L1 = grad_t30L1 - t30L1*= ggml_add (t21L1, t29L1) | 
grad_t30L1 = ggml_rms_norm_back(t30L1, grad_t31) - ^ - t31 = ggml_rms_norm (t30L1) | grad_t31 = ggml_mul(grad_t33, t32) - t32 = ggml_repeat (norm, t31.shape) | grad_t32 = ggml_mul(grad_t33, t31) - t33 = ggml_mul (t32, t31) | grad_t33 = ggml_out_prod(output, ggml_transpose(grad_t34)) - t34 = ggml_mul_mat (output, t33) | grad_t34 = ggml_reshape(grad_t35, t34.shape) - t35 = ggml_reshape_3d (t34, n_vocab, N, n_batch) | grad_t35 = ggml_cross_entropy_loss_back(t35, targets, grad_t36) - t36 = ggml_cross_entropy_loss(t35, targets) | grad_t36 = 1 (optimizer) - tensors marked with * need to be stored until grad computation - tensors during grad computation are all temporary - */ - } - - *gb = *gf; - - // t36->grad gets set to one by optimizer, so we need the tensor. - // initialize it with 1.0f to make sure. - use_buf(-1); - t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); - - use_buf(0); - t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); - t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); - t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); - t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); - - use_buf(-1); - - model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); - model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); - - clr_buf(1); - use_buf(1); - t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); + struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, f_norm_rms_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); + struct ggml_tensor * t03 = ggml_repeat (ctx, layer.attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); + struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); + struct ggml_tensor * t05 = ggml_mul_mat (ctx, layer.wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); + struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t08 = ggml_mul_mat (ctx, layer.wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch); + struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t11 = ggml_mul_mat (ctx, t04, layer.wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd); + struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + 
struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + struct ggml_tensor * t16; + if (enable_flash_attn) { + t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + } else { + struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); + struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); + struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); + struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); + t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + } + struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); + struct ggml_tensor * t20 = ggml_mul_mat (ctx, layer.wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); + struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); + struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); + struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); + struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); + struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); + cur = t30; + checkpoints.push_back(cur); + } + struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, f_norm_rms_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = ggml_repeat (ctx, model->norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); + struct ggml_tensor * t34 = ggml_mul_mat (ctx, model->output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = ggml_reshape_3d 
(ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); + + checkpoints.push_back(t31); + checkpoints.push_back(t32); + checkpoints.push_back(t33); + checkpoints.push_back(t34); + checkpoints.push_back(t35); + checkpoints.push_back(t36); + + ggml_build_forward_expand(gf, t36); + + if (enable_checkpointing) { + ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); + } else { + *gb = *gf; + ggml_build_backward_expand(ctx, gf, gb, true); + } + + if (alloc) { + // make sure some tensors are not reallocated by inserting new temporary nodes depending on them + int n_leafs_before = gb->n_leafs; + int n_nodes_before = gb->n_nodes; + struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); + // output tensors + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + // input gradient + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); + GGML_ASSERT(t36->grad->data == NULL && !ggml_is_view(t36->grad)); + ggml_allocr_alloc(alloc, t36->grad); + // gradient tensors (will be set to zero by ggml_graph_reset) + // pinning these produces large unnecessary memory overhead, which will be resolved by PR 2632 + for (int i = 0; i < gf->n_nodes; ++i) { + if (!gf->grads[i]) continue; + if (gf->grads[i]->data == NULL && !ggml_is_view(gf->grads[i])) { + ggml_allocr_alloc(alloc, gf->grads[i]); + } + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, gf->grads[i], one)); + } + // allocating checkpoints in one block to reduce memory fragmentation + // note: they will be freed in reverse order + for (int i = 0; i < (int) checkpoints.size(); ++i) { + if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { + ggml_allocr_alloc(alloc, checkpoints[i]); + } + } - struct ggml_tensor * back_layer_inp = t31; - struct ggml_tensor * grad_layer_inp = NULL; + //int n_leafs_after = gb->n_leafs; + //int n_nodes_after = gb->n_nodes; - for (int k = 0; k < n_layer; ++k) { - int il = n_layer-1-k; - struct my_llama_layer & layer = model->layers[il]; + ggml_allocr_alloc_graph(alloc, gb); - struct ggml_tensor * t02 = t02L[il]; - struct ggml_tensor * t03 = t03L[il]; - struct ggml_tensor * t04 = t04L[il]; - struct ggml_tensor * t05 = t05L[il]; - struct ggml_tensor * t06 = t06L[il]; - struct ggml_tensor * t07 = t07L[il]; - struct ggml_tensor * t08 = t08L[il]; - struct ggml_tensor * t09 = t09L[il]; - struct ggml_tensor * t10 = t10L[il]; - struct ggml_tensor * t11 = t11L[il]; - struct ggml_tensor * t12 = t12L[il]; - struct ggml_tensor * t13 = t13L[il]; - struct ggml_tensor * t14 = t14L[il]; - struct ggml_tensor * t15 = t15L[il]; - struct ggml_tensor * t16 = t16L[il]; - struct ggml_tensor * t17 = t17L[il]; - struct ggml_tensor * t18 = t18L[il]; - struct ggml_tensor * t19 = t19L[il]; - struct ggml_tensor * t20 = t20L[il]; - struct ggml_tensor * t21 = t21L[il]; - struct ggml_tensor * t22 = t22L[il]; - struct ggml_tensor * t23 = t23L[il]; - struct ggml_tensor * t24 = t24L[il]; - struct ggml_tensor * t25 = t25L[il]; - struct ggml_tensor * t26 = t26L[il]; - struct ggml_tensor * t27 = t27L[il]; - struct ggml_tensor * t28 = t28L[il]; - struct ggml_tensor * t29 = t29L[il]; - struct ggml_tensor * t30 = t30L[il]; - - clr_buf(0); - use_buf(0); - t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); 
assert_shape_2d(t30->grad, n_embd, N*n_batch); - if (grad_layer_inp) { - t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + // remove the additional nodes and leafs + for (int i = n_leafs_before; i < gb->n_leafs; ++i) { + gb->leafs[i] = NULL; + } + for (int i = n_nodes_before; i < gb->n_nodes; ++i) { + gb->nodes[i] = NULL; } - clr_buf(1); - t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); - t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); - t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); - t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); - t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); - t24->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)), - ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); - t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); - t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); - use_buf(1); - t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); - grad_layer_inp = t21; - use_buf(0); - t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); - t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); - t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); - t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); - t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); - t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); - t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); - t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); - t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); - t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); - t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); - t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); - t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); - t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); - t06->grad = expand(gb, 
ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
-        t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch));                                       assert_shape_2d(t05->grad, n_embd, N*n_batch);
-        t04->grad = expand(gb, ggml_add_inplace(ctx0,
-                        ggml_add_inplace(ctx0,
-                            ggml_out_prod(ctx0, layer.wv, t11->grad),
-                            ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))),
-                        ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad))));                                   assert_shape_2d(t04->grad, n_embd, N*n_batch);
-        t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02));                                                             assert_shape_2d(t04->grad, n_embd, N*n_batch);
-        use_buf(1);
-        t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02)));                    assert_shape_2d(t02->grad, n_embd, N*n_batch);
-        back_layer_inp = t02;
-        // use_buf(0);
-
-        use_buf(-1);
-        layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd);
-        layer.wq->grad             = expand(gb, add_or_set(layer.wq->grad,             ggml_out_prod(ctx0, t04, t05->grad)));                     assert_shape_2d(layer.wq->grad, n_embd, n_embd);
-        layer.wk->grad             = expand(gb, add_or_set(layer.wk->grad,             ggml_out_prod(ctx0, t04, t08->grad)));                     assert_shape_2d(layer.wk->grad, n_embd, n_embd);
-        layer.wv->grad             = expand(gb, add_or_set(layer.wv->grad,             ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd);
-        layer.wo->grad             = expand(gb, add_or_set(layer.wo->grad,             ggml_out_prod(ctx0, t19, t20->grad)));                     assert_shape_2d(layer.wo->grad, n_embd, n_embd);
-        layer.ffn_norm->grad       = expand(gb, add_or_set(layer.ffn_norm->grad,       ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm)));       assert_shape_1d(layer.ffn_norm->grad, n_embd);
-        layer.w1->grad             = expand(gb, add_or_set(layer.w1->grad,             ggml_out_prod(ctx0, t24, t26->grad)));                     assert_shape_2d(layer.w1->grad, n_embd, n_ff);
-        layer.w2->grad             = expand(gb, add_or_set(layer.w2->grad,             ggml_out_prod(ctx0, t28, t29->grad)));                     assert_shape_2d(layer.w2->grad, n_ff, n_embd);
-        layer.w3->grad             = expand(gb, add_or_set(layer.w3->grad,             ggml_out_prod(ctx0, t24, t25->grad)));                     assert_shape_2d(layer.w3->grad, n_embd, n_ff);
-        // use_buf(0);
+        gb->n_leafs = n_leafs_before;
+        gb->n_nodes = n_nodes_before;
    }
-    clr_buf(0);
-    use_buf(0);
-    t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch);
-    use_buf(-1);
-    model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab);
-    // clr_buf(1);
-    // clr_buf(0);
     *logits = t35;
-
-    if (track_max_mem) {
-        printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]);
-        printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]);
-    }
-
-    // now that all grads are created, set the graph leafs and grads
-    graph_set_leafs_grads(gf);
-    graph_set_leafs_grads(gb);
     return t36;
 }
@@ -1959,43 +870,7 @@ void print_matrix(struct ggml_tensor * probs) {
     }
 }
-
-void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
-}
-
-void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
-    for (int i=0; i<tokens->ne[0]; ++i) {
-        int token = ggml_get_i32_1d(tokens, i);
-        print_token(ctx, token);
-    }
-}
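When enable_flash_attn is off, llama_build_train_graphs above lowers attention to the explicit pipeline: QK^T, scale by kv_scale = 1/sqrt(n_embd/n_head), causal mask, softmax, then multiply by V. The same arithmetic on plain arrays, as a tiny checkable reference (sizes and values here are made up for illustration):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Toy single-head causal attention; q, k, v are [n][d] row-major.
    int main() {
        const int n = 3, d = 2;
        float q[n][d] = {{1, 0}, {0, 1}, {1, 1}};
        float k[n][d] = {{1, 0}, {0, 1}, {1, 1}};
        float v[n][d] = {{1, 2}, {3, 4}, {5, 6}};
        const float scale = 1.0f / std::sqrt((float) d); // plays the role of kv_scale

        for (int i = 0; i < n; ++i) {
            // scores s_j = scale * (q_i . k_j), masked to j <= i (diag_mask_inf)
            std::vector<float> s(n, 0.0f);
            float mx = -INFINITY;
            for (int j = 0; j <= i; ++j) {
                float dot = 0.0f;
                for (int t = 0; t < d; ++t) dot += q[i][t]*k[j][t];
                s[j] = scale * dot;
                if (s[j] > mx) mx = s[j];
            }
            // softmax over the unmasked prefix (soft_max_inplace)
            float sum = 0.0f;
            for (int j = 0; j <= i; ++j) { s[j] = std::exp(s[j] - mx); sum += s[j]; }
            // output row o_i = sum_j softmax(s)_j * v_j
            for (int t = 0; t < d; ++t) {
                float o = 0.0f;
                for (int j = 0; j <= i; ++j) o += (s[j]/sum) * v[j][t];
                std::printf("%s%.3f", t ? " " : "", o);
            }
            std::printf("\n");
        }
    }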
-void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) {
-    for (int i1=0; i1<tokens->ne[1]; ++i1) {
-        //int num_newline = 0;
-        for (int i0=0; i0<tokens->ne[0]; ++i0) {
-            int token = get_i32_2d(tokens, i0, i1);
-            print_token(ctx, token);
-            // bool isnl = (token == llama_token_nl());
-            // if (isnl) {
-            //     ++num_newline;
-            // }
-            // if (isnl) {
-            //     if (num_newline < 2) {
-            //         print_token(ctx, token);
-            //     } else {
-            //         printf("\\n");
-            //     }
-            // } else {
-            //     print_token(ctx, token);
-            // }
        }
-        printf("\n--\n");
-    }
-}
-
-void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab = target_logits->ne[0];
@@ -2004,7 +879,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
     ggml_set_f32(target_logits, -1.0f/n_vocab);
     ggml_set_f32(target_probs, 0.0f);
-    ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
+    ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx));
     for (int i=1; in_dims == 2);
     GGML_ASSERT(target_logits->n_dims == 3);
     GGML_ASSERT(target_probs->n_dims == 3);
@@ -2030,187 +905,90 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai
     ggml_set_f32(target_logits, -1.0f/n_vocab);
     ggml_set_f32(target_probs, 0.0f);
+    // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
     for (int k=0; kne[0];
-    int n_vocab = target_logits->ne[0];
-    for (int i=0; i= 0 && size < INT_MAX);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            size = 0;
-        } else {
-            seek(0, SEEK_END);
-            size = tell();
-            seek(0, SEEK_SET);
-        }
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            throw std::runtime_error(format("read error: %s", strerror(errno)));
-        }
-        if (ret != 1) {
-            throw std::runtime_error(std::string("unexpectedly reached end of file"));
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            throw std::runtime_error(format("write error: %s", strerror(errno)));
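The rewritten tokenize_file below relies on the two-pass convention of the updated llama_tokenize: when the output buffer is too small, the call returns the negated number of tokens required, so the caller can resize and try again. A minimal sketch of just that retry idiom, assuming the llama.h signature used in this patch:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Tokenize `text`, growing the output on demand. A negative return value
    // from llama_tokenize is the negated token count that would be produced.
    static std::vector<llama_token> tokenize_all(struct llama_context * lctx, const std::string & text) {
        std::vector<llama_token> out(text.size() + 1); // generous first guess
        int n = llama_tokenize(lctx, text.data(), (int) text.size(), out.data(), (int) out.size(), false);
        if (n < 0) {
            out.resize(-n); // exact size reported by the failed call
            n = llama_tokenize(lctx, text.data(), (int) text.size(), out.data(), (int) out.size(), false);
        }
        out.resize(n > 0 ? n : 0); // keep only the tokens actually written
        return out;
    }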
std::runtime_error(format("write error: %s", strerror(errno))); + if (i& out) { + FILE * fp = std::fopen(filename, "rb"); + if (fp == NULL) { + return 0; } - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; +#ifdef _WIN32 + GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_END) == 0); +#else + GGML_ASSERT(std::fseek(fp, (long) 0, SEEK_END) == 0); +#endif -int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { - struct llama_file f(filename, "rb"); + size_t size = 0; +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); + size = ret; +#else + long ret = std::ftell(fp); + size = ret; +#endif + +#ifdef _WIN32 + GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_SET) == 0); +#else + GGML_ASSERT(std::fseek(fp, (long) 0, SEEK_SET) == 0); +#endif std::vector buf; - buf.resize(f.size+1); + buf.resize(size+1); + out.resize(size+1); - f.read_raw(buf.data(), f.size); - buf[f.size] = '\0'; + if (std::fread(buf.data(), size, 1, fp) != 1) { + die("unexpectedly reached end of file"); + } + if (ferror(fp)) { + die_fmt("fread failed: %s", strerror(errno)); + } - out.resize(buf.size()); + buf[size] = '\0'; - int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); - if (n_tokens >= 0) { - out.resize(n_tokens); + int n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false); + if (n_tokens < 0) { + out.resize(-n_tokens); + n_tokens = llama_tokenize(lctx, buf.data(), buf.size(), out.data(), out.size(), false); } + GGML_ASSERT(n_tokens >= 0); + out.resize(n_tokens); bool verify = false; if (verify) { const char * in = buf.data(); const char * end = buf.data() + buf.size(); for (int i = 0; i < (int) out.size(); ++i) { - const char * s = llama_token_to_str(lctx, out[i]); - int len = strlen(s); + std::string s = llama_token_to_piece(lctx, out[i]); + int len = s.length(); if (in >= end) { printf("%s: unexpected end of original text.\n", __func__); break; } - const bool matches = (strncmp(in, s, len) == 0); + const bool matches = (strncmp(in, s.c_str(), len) == 0); if (matches) { in += len; } else { - printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s); + printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str()); } } } @@ -2236,435 +1014,466 @@ void shuffle_ints(int * begin, int * end) { }); } -struct my_llama_sampler_params { - float temp = 0.0f; // <= 0.0 disabled - int top_k = 20; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - int repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float repeat_penalty = 1.0f; // 1.0 = disabled - float alpha_presence = 0.0f; // 0.0 = disabled - float alpha_frequency = 0.0f; // 0.0 = disabled - int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token -}; +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + die_fmt("key not found in model: %s", skey.c_str()); \ + } \ +} 
-struct my_llama_sampler {
-    struct llama_context * ctx = NULL;
-    my_llama_sampler_params params;
-    int n_vocab = 0;
-    int n_ctx = 0;
+bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) {
+    GGML_ASSERT(a != NULL);
+    GGML_ASSERT(b != NULL);
+    GGML_ASSERT(a->type == b->type);
+    GGML_ASSERT(ggml_are_same_shape(a, b));
+    GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b));
-    float mirostat_mu;
+    return true;
+}
-    std::vector<llama_token_data> candidates;
-    llama_token_data_array candidates_p;
+void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) {
+    if (dst == NULL) {
+        return;
+    }
+    struct ggml_tensor * t = ggml_get_tensor(ctx, name);
+    GGML_ASSERT(are_same_layout(dst, t));
+    memcpy(dst->data, t->data, ggml_nbytes(t));
-};
+    if (strlen(ggml_get_name(dst)) == 0) {
+        ggml_set_name(dst, name);
+    }
+}
-void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx) {
-    sampler->ctx = ctx;
-    sampler->n_vocab = llama_n_vocab(sampler->ctx);
-    sampler->n_ctx = llama_n_ctx(sampler->ctx);
-    sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau;
+void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) {
+    // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
+
+    uint32_t file_version;
+    GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION);
+    GGML_ASSERT(file_version == 0);
+
+    GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT);
+    GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT);
+    GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED);
+
+    uint64_t nx;
+    GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT);
+    opt->nx = (size_t) nx;
+
+    // don't call ggml_opt_init until optimizer type and optimizer specific parameters are known
+
+    std::string opt_type;
+    GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
+    if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
+        opt->params.type = GGML_OPT_ADAM;
+
+        GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
+        GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);
+        GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT);
+
+        GGML_ASSERT(opt->ctx != NULL);
+        ggml_opt_init(opt->ctx, opt, opt->params, opt->nx);
+
+        read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS);
+        read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
+        read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
+    } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
+        opt->params.type = GGML_OPT_LBFGS;
+
+        GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
+        GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);
+        GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP);
+
GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); + GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); + GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); + GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); + + GGML_ASSERT(opt->ctx != NULL); + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + read_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + read_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + read_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + read_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + read_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + read_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + read_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); + read_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); + read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); + read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); + } else { + die("unknown optimizer type"); + } } -llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) { - GGML_ASSERT(sampler->ctx != NULL); +void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); + gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); + gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); - struct llama_context * ctx = sampler->ctx; + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); + + ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + if (opt->adam.pf) { + ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } - sampler->candidates.resize(sampler->n_vocab); - for (llama_token token_id = 0; token_id < sampler->n_vocab; ++token_id) { - sampler->candidates[token_id].id = token_id; - sampler->candidates[token_id].logit = logits[token_id]; - sampler->candidates[token_id].p = 0.0; + gguf_add_tensor(fctx, opt->adam.m); + gguf_add_tensor(fctx, opt->adam.v); + if (opt->adam.pf) { + gguf_add_tensor(fctx, opt->adam.pf); + } + } break; + case GGML_OPT_LBFGS: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, 
opt->lbfgs.fx_best);
+                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step);
+                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j);
+                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k);
+                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end);
+                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement);
+
+                ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS);
+                ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS);
+                ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS);
+                ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS);
+                ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION);
+                if (opt->lbfgs.pf) {
+                    ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
+                }
+                ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
+                ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
+                ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
+                ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
+
+                gguf_add_tensor(fctx, opt->lbfgs.x);
+                gguf_add_tensor(fctx, opt->lbfgs.xp);
+                gguf_add_tensor(fctx, opt->lbfgs.g);
+                gguf_add_tensor(fctx, opt->lbfgs.gp);
+                gguf_add_tensor(fctx, opt->lbfgs.d);
+                if (opt->lbfgs.pf) {
+                    gguf_add_tensor(fctx, opt->lbfgs.pf);
+                }
+                gguf_add_tensor(fctx, opt->lbfgs.lmal);
+                gguf_add_tensor(fctx, opt->lbfgs.lmys);
+                gguf_add_tensor(fctx, opt->lbfgs.lms);
+                gguf_add_tensor(fctx, opt->lbfgs.lmy);
+            } break;
    }
+}
-    llama_token_data_array * candidates_p = & sampler->candidates_p;
+void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) {
+    // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
+    std::string arch;
-    candidates_p->data = sampler->candidates.data();
-    candidates_p->size = sampler->candidates.size();
-    candidates_p->sorted = false;
+    std::vector<char> keybuf;
+    keybuf.resize(512);
+    auto kv = [&arch, &keybuf](const char * key) -> const char * {
+        snprintf(keybuf.data(), keybuf.size(), key, arch.c_str());
+        return keybuf.data();
+    };
-    const auto params = sampler->params;
+    std::vector<char> tn_buf;
+    tn_buf.resize(GGML_MAX_NAME);
+    auto tn = [&tn_buf](const char * key) -> const char * {
+        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
+        return tn_buf.data();
+    };
+    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
+        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
+        std::string s = tn_buf.data();
+        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
+        return tn_buf.data();
+    };
-    // Apply penalties
-    const float nl_logit = logits[llama_token_nl()];
+    GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
+    GGML_ASSERT(arch == "llama");
-    const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);
+    uint32_t ftype_u;
+    GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE);
+    GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32);
-    llama_sample_repetition_penalty(
-        ctx,
-        candidates_p,
-        last_tokens + n_last_tokens - n_last,
-        n_last,
-        params.repeat_penalty);
-    llama_sample_frequency_and_presence_penalties(
-        ctx,
-        candidates_p,
-        last_tokens + n_last_tokens - n_last,
-
n_last,
-        params.alpha_frequency,
-        params.alpha_presence);
+    // n_ctx was not saved in earlier checkpoint file versions, so we make it optional here
+    GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
-    if (!params.penalize_nl) {
-        logits[llama_token_nl()] = nl_logit;
-    }
+    GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
+    GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
+    GGUF_GET_KEY(fctx, model->hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
+    GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
-    llama_token token = 0;
-    if (params.temp <= 0) {
-        // Greedy sampling
-        token = llama_sample_token_greedy(ctx, candidates_p);
-    } else {
-        if (params.mirostat == 1) {
-            int mirostat_m = 100;
-            llama_sample_temperature(ctx, candidates_p, params.temp);
-            token = llama_sample_token_mirostat(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, mirostat_m, &sampler->mirostat_mu);
-        } else if (params.mirostat == 2) {
-            llama_sample_temperature(ctx, candidates_p, params.temp);
-            token = llama_sample_token_mirostat_v2(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, &sampler->mirostat_mu);
-        } else {
-            // Temperature sampling
-            llama_sample_top_k        (ctx, candidates_p, params.top_k, 1);
-            llama_sample_tail_free    (ctx, candidates_p, params.tfs_z, 1);
-            llama_sample_typical      (ctx, candidates_p, params.typical_p, 1);
-
-            llama_sample_top_p        (ctx, candidates_p, params.top_p, 1);
-            llama_sample_temperature  (ctx, candidates_p, params.temp);
-            token = llama_sample_token(ctx, candidates_p);
-        }
-    }
-    return token;
-}
+    model->hparams.n_rot = model->hparams.n_embd / model->hparams.n_head;
+    GGUF_GET_KEY(fctx, model->hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
-void set_logits_masked(struct ggml_tensor * logits, std::vector<bool>& mask, float value) {
-    GGML_ASSERT(logits->ne[0] == (int64_t) mask.size());
-    for (int i2 = 0; i2 < logits->ne[2]; ++i2) {
-        for (int i1 = 0; i1 < logits->ne[1]; ++i1) {
-            for (int i0 = 0; i0 < logits->ne[0]; ++i0) {
-                if (!mask[i0]) continue;
-                float * ptr = (float *) ((char *) logits->data + i2*logits->nb[2] + i1*logits->nb[1] + i0*logits->nb[0]);
-                *ptr = value;
-            }
-        }
+    float rope_freq_scale = 1.0f;
+    GGUF_GET_KEY(fctx, model->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+    GGUF_GET_KEY(fctx, model->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
+    GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    if (rope_freq_scale != 1.0f) {
+        model->hparams.rope_freq_scale = 1.0f / rope_freq_scale;
    }
-void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
-    if (tensor == NULL) {
-        file->write_u32(0);
-        file->write_u32(0);
-        file->write_u32(GGML_TYPE_F32);
-        file->seek((0-file->tell()) & 31, SEEK_CUR);
-        return;
+    init_model(model);
+
+    read_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD));
+    read_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM));
+    read_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT));
+
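For reference, the tn()/tni() helpers above flatten the per-layer reads that follow into plain GGUF tensor names. A standalone illustration of that expansion (treating "blk.%d.attn_q" as the usual llama naming convention, which is an assumption here, not something this patch defines):

    #include <cstdio>

    int main() {
        // tni(LLM_TENSOR_ATTN_Q, 3): first substitute the layer index ...
        char base[64];
        std::snprintf(base, sizeof(base), "blk.%d.attn_q", 3);
        // ... then append ".weight", exactly as tn()/tni() do with tn_buf
        char name[64];
        std::snprintf(name, sizeof(name), "%s.weight", base);
        std::printf("%s\n", name); // prints: blk.3.attn_q.weight
    }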
model->layers[i]; + + read_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i)); + read_tensor_by_name(layer.wq, f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i)); + read_tensor_by_name(layer.wk, f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i)); + read_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); + read_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); + read_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); + read_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); + read_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); + read_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); } - const char * name = ggml_get_name(tensor); - uint32_t name_len = strlen(name); - uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - file->write_u32(nd); - file->write_u32(name_len); - file->write_u32(tensor->type); - file->write_raw(ne, sizeof(ne[0]) * nd); - file->write_raw(name, name_len); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->write_raw(tensor->data, ggml_nbytes(tensor)); } -void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - int32_t nd = file->read_u32(); - GGML_ASSERT(nd == tensor->n_dims); +void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) { + const char * arch = "llama"; + enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; - uint32_t name_len = file->read_u32(); - enum ggml_type type = (enum ggml_type) file->read_u32(); - GGML_ASSERT(type == tensor->type); + std::vector<char> keybuf; + keybuf.resize(512); + auto kv = [arch, &keybuf](const char * key) -> const char * { + snprintf(keybuf.data(), keybuf.size(), key, arch); + return keybuf.data(); + }; - uint32_t ne[4]; - file->read_raw(ne, sizeof(ne[0]) * nd); - for (int i=0; i<nd; ++i) { - GGML_ASSERT(ne[i] == tensor->ne[i]); - } + // set arch + gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); + gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); - std::string name = file->read_string(name_len); - GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); + // set hparams + gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx ); + gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd ); + gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff ); + gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head ); + gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer ); + gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_rot ); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->read_raw(tensor->data, ggml_nbytes(tensor)); -} + gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps ); + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base ); // TODO load in llama.cpp + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), 1.0f / model->hparams.rope_freq_scale ); -void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { - const uint32_t version = 0; - GGML_ASSERT(opt->nx >= 0); - GGML_ASSERT(opt->iter >= 0); - file->write_u32(version); - file->write_raw(&opt->params, sizeof(opt->params)); - file->write_raw(&opt->nx, sizeof(opt->nx)); - file->write_raw(&opt->iter, sizeof(opt->iter)); - file->write_u32((uint32_t) opt->just_initialized); - switch
(opt->params.type) { - case GGML_OPT_ADAM: - { - GGML_ASSERT(opt->adam.x != NULL); - write_tensor(file, opt->adam.x); - write_tensor(file, opt->adam.g1); - write_tensor(file, opt->adam.g2); - write_tensor(file, opt->adam.m); - write_tensor(file, opt->adam.v); - write_tensor(file, opt->adam.mh); - write_tensor(file, opt->adam.vh); - write_tensor(file, opt->adam.pf); - file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; - case GGML_OPT_LBFGS: - { - GGML_ASSERT(opt->adam.x != NULL); - write_tensor(file, opt->lbfgs.x); - write_tensor(file, opt->lbfgs.xp); - write_tensor(file, opt->lbfgs.g); - write_tensor(file, opt->lbfgs.gp); - write_tensor(file, opt->lbfgs.d); - write_tensor(file, opt->lbfgs.pf); - write_tensor(file, opt->lbfgs.lmal); - write_tensor(file, opt->lbfgs.lmys); - write_tensor(file, opt->lbfgs.lms); - write_tensor(file, opt->lbfgs.lmy); - file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} + // set vocab by copying from vocab_model gguf file + { + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ NULL, + }; + struct gguf_context * vctx = gguf_init_from_file(fn_vocab_model, params); -void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - uint32_t version = file->read_u32(); - GGML_ASSERT(version == 0); + const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST)); + if (token_idx == -1) { + die("cannot find tokenizer vocab in model file"); + } + const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx); - file->read_raw(&opt->params, sizeof(opt->params)); - file->read_raw(&opt->nx, sizeof(opt->nx)); - ggml_opt_init(ctx, opt, opt->params, opt->nx); + const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES)); + if (score_idx == -1) { + die("cannot find tokenizer scores in model file"); + } - file->read_raw(&opt->iter, sizeof(opt->iter)); - opt->just_initialized = (bool) file->read_u32(); + const float * scores = (const float * ) gguf_get_arr_data(vctx, score_idx); - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - read_tensor(file, opt->adam.x); - read_tensor(file, opt->adam.g1); - read_tensor(file, opt->adam.g2); - read_tensor(file, opt->adam.m); - read_tensor(file, opt->adam.v); - read_tensor(file, opt->adam.mh); - read_tensor(file, opt->adam.vh); - if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } - file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; - case GGML_OPT_LBFGS: - { - GGML_ASSERT(opt->adam.x != NULL); - read_tensor(file, opt->lbfgs.x); - read_tensor(file, opt->lbfgs.xp); - read_tensor(file, opt->lbfgs.g); - read_tensor(file, opt->lbfgs.gp); - read_tensor(file, opt->lbfgs.d); - if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } - read_tensor(file, opt->lbfgs.lmal); - read_tensor(file, opt->lbfgs.lmys); - read_tensor(file, opt->lbfgs.lms); - read_tensor(file, 
opt->lbfgs.lmy); - file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} + const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE)); + if (toktype_idx == -1) { + die("cannot find token type list in GGUF file"); + } -void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename) { - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; - } + const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx); + + std::string tokenizer_name; + GGUF_GET_KEY(vctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL)); + + gguf_set_val_str(fctx, kv(LLM_KV_TOKENIZER_MODEL), tokenizer_name.c_str()); + gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_SCORES), GGUF_TYPE_FLOAT32, scores, n_vocab); + gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE), GGUF_TYPE_INT32, toktypes, n_vocab); + + int32_t special_bos_id = 1; + int32_t special_eos_id = 2; + int32_t special_unk_id = 0; + int32_t special_sep_id = -1; + int32_t special_pad_id = -1; + if (tokenizer_name == "llama") { + // default special tokens + special_bos_id = 1; + special_eos_id = 2; + special_unk_id = 0; + special_sep_id = -1; + special_pad_id = -1; + } else if (tokenizer_name == "gpt2") { + // read and copy bpe merges + const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES)); + if (merges_keyidx == -1) { + die("cannot find tokenizer merges in model file"); + } + + const int n_merges = gguf_get_arr_n(vctx, merges_keyidx); + + std::vector<const char*> merges; + merges.resize(n_merges); + for (int i = 0; i < n_merges; i++) { + merges[i] = gguf_get_arr_str(vctx, merges_keyidx, i); + } + gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_MERGES), merges.data(), n_merges); + + // default special tokens + special_bos_id = 11; + special_eos_id = 11; + special_unk_id = -1; + special_sep_id = -1; + special_pad_id = -1; + } else { + fprintf(stderr, "%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); + fprintf(stderr, "%s: using default tokenizer: 'llama'", __func__); + } - const uint32_t magic = 'ggcp'; - const uint32_t version = 0; - - file.write_u32(magic); - file.write_u32(version); - file.write_u32(model->train_its); - file.write_u32(model->train_samples); - file.write_u32(model->train_tokens); - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(model->hparams.n_mult); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - file.write_u32(model->hparams.n_rot); - - write_tensor(&file, model->tok_embeddings); - write_tensor(&file, model->norm); - write_tensor(&file, model->output); + std::vector<const char*> tokens; + tokens.resize(n_vocab); + for (uint32_t i = 0; i < n_vocab; i++) { + tokens[i] = gguf_get_arr_str(vctx, token_idx, i); + } + gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_LIST), tokens.data(), n_vocab); + + GGUF_GET_KEY(vctx, special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); + GGUF_GET_KEY(vctx, special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID)); + GGUF_GET_KEY(vctx, special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
kv(LLM_KV_TOKENIZER_UNK_ID)); + GGUF_GET_KEY(vctx, special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID)); + GGUF_GET_KEY(vctx, special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID)); + + gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_BOS_ID), special_bos_id); + gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_EOS_ID), special_eos_id); + gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_UNK_ID), special_unk_id); + gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_SEP_ID), special_sep_id); + gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_PAD_ID), special_pad_id); + + gguf_free(vctx); + } + // add tensors + gguf_add_tensor(fctx, model->tok_embeddings); + gguf_add_tensor(fctx, model->norm); + gguf_add_tensor(fctx, model->output); for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; - write_tensor(&file, layer.attention_norm); - write_tensor(&file, layer.wq); - write_tensor(&file, layer.wk); - write_tensor(&file, layer.wv); - write_tensor(&file, layer.wo); - write_tensor(&file, layer.ffn_norm); - write_tensor(&file, layer.w1); - write_tensor(&file, layer.w2); - write_tensor(&file, layer.w3); + + gguf_add_tensor(fctx, layer.attention_norm); + gguf_add_tensor(fctx, layer.wq); + gguf_add_tensor(fctx, layer.wk); + gguf_add_tensor(fctx, layer.wv); + gguf_add_tensor(fctx, layer.wo); + gguf_add_tensor(fctx, layer.ffn_norm); + gguf_add_tensor(fctx, layer.w1); + gguf_add_tensor(fctx, layer.w2); + gguf_add_tensor(fctx, layer.w3); } +} + +void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) { + struct gguf_context * fctx = gguf_init_empty(); - write_opt_context(&file, opt); + save_llama_model_gguf(fctx, fn_vocab_model, model); + + // write file + const bool only_meta = false; + gguf_write_to_file(fctx, filename, only_meta); + gguf_free(fctx); } -bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, bool init) { - struct llama_file file(filename, "rb"); +void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct ggml_opt_context * opt) { + load_llama_model_gguf(fctx, f_ggml_ctx, model); - uint32_t magic; - uint32_t version; + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); + GGML_ASSERT(file_version == 0); - uint32_t train_its = 0; - uint32_t train_samples = 0; - uint32_t train_tokens = 0; + GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); - if (file.fp) { - printf("%s: Loading model from '%s'.\n", __func__, filename); - magic = file.read_u32(); - GGML_ASSERT(magic == 'ggcp'); - version = file.read_u32(); - GGML_ASSERT(version == 0); - train_its = file.read_u32(); - train_samples = file.read_u32(); - train_tokens = file.read_u32(); - model->hparams.n_vocab = file.read_u32(); - model->hparams.n_embd = file.read_u32(); - model->hparams.n_mult = file.read_u32(); - model->hparams.n_head = file.read_u32(); - model->hparams.n_layer = file.read_u32(); - model->hparams.n_rot = file.read_u32(); - print_params(&model->hparams); - } + load_opt_context_gguf(fctx, f_ggml_ctx, opt); +} - if (init) { - 
init_model(model); - } +void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { + save_llama_model_gguf(fctx, fn_vocab_model, model); - if (file.fp) { - model->train_its = train_its; - model->train_samples = train_samples; - model->train_tokens = train_tokens; - } + gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_ITERATION_COUNT, model->train_its); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, model->train_samples); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_TOKEN_COUNT, model->train_tokens); - printf("%s: Training iterations: %u.\n", __func__, model->train_its); - printf("%s: Training samples: %u.\n", __func__, model->train_samples); - printf("%s: Training tokens: %u.\n", __func__, model->train_tokens); - - if (file.fp) { - read_tensor(&file, model->tok_embeddings); - read_tensor(&file, model->norm); - read_tensor(&file, model->output); - - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - read_tensor(&file, layer.attention_norm); - read_tensor(&file, layer.wq); - read_tensor(&file, layer.wk); - read_tensor(&file, layer.wv); - read_tensor(&file, layer.wo); - read_tensor(&file, layer.ffn_norm); - read_tensor(&file, layer.w1); - read_tensor(&file, layer.w2); - read_tensor(&file, layer.w3); - } + save_opt_context_gguf(fctx, opt); +} - read_opt_context(&file, model->ctx, opt); +bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct ggml_opt_context * opt) { + struct ggml_context * f_ggml_ctx; + struct gguf_init_params params; + params.no_alloc = false; + params.ctx = &f_ggml_ctx; + struct gguf_context * fctx = gguf_init_from_file(filename, params); + if (fctx == NULL) { + return false; } - return (file.fp != NULL); + load_checkpoint_gguf(fctx, f_ggml_ctx, model, opt); + + return true; } -void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, const char * filename) { - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; - } +void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { + struct gguf_context * fctx = gguf_init_empty(); - // write_magic - file.write_u32(LLAMA_FILE_MAGIC); // magic - file.write_u32(LLAMA_FILE_VERSION); // version - // write_hparams - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(model->hparams.n_mult); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - file.write_u32(model->hparams.n_rot); - file.write_u32(LLAMA_FTYPE_ALL_F32); - // write_vocab - uint32_t n_vocab = model->hparams.n_vocab; - for (uint32_t i = 0; i < n_vocab; i++) { - const auto & token_score = vocab->id_to_token.at(i); - file.write_u32((uint32_t) token_score.tok.size()); - file.write_raw(token_score.tok.data(), token_score.tok.size()); - file.write_raw(&token_score.score, sizeof(token_score.score)); - } - // write tensors - write_tensor(&file, model->tok_embeddings); - write_tensor(&file, model->norm); - write_tensor(&file, model->output); - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; + save_checkpoint_gguf(fctx, fn_vocab_model, model, opt); - write_tensor(&file, layer.attention_norm); - write_tensor(&file, layer.wq); - write_tensor(&file, layer.wk); - write_tensor(&file, layer.wv); - write_tensor(&file, layer.wo); - 
write_tensor(&file, layer.ffn_norm); - write_tensor(&file, layer.w1); - write_tensor(&file, layer.w2); - write_tensor(&file, layer.w3); - } + // write file + const bool only_meta = false; + gguf_write_to_file(fctx, filename, only_meta); + gguf_free(fctx); } -float cosine_decay(const int decay_steps, const float alpha, int step) { +float cosine_decay(const int decay_steps, const float minimum, int step) { if (step > decay_steps) { step = decay_steps; } const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); - const float decay = (1 - alpha)*cosine_decay + alpha; + const float decay = (1 - minimum)*cosine_decay + minimum; return decay; } -float cosine_decay_restart(int decay_steps, const float alpha, int step, float restart_step_mult) { - while (step > decay_steps) { - step -= decay_steps; - decay_steps = (int) restart_step_mult * decay_steps; +float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) { + if (enable_restart) { + while (step > decay_steps) { + step -= decay_steps; + decay_steps = (int) (restart_step_mult * decay_steps); + } } - return cosine_decay(decay_steps, alpha, step); + return cosine_decay(decay_steps, minimum, step); } struct train_params { @@ -2678,39 +1487,51 @@ struct train_params { int n_ctx; int n_embd; - int n_mult; int n_head; int n_layer; - int n_rotmax; + int n_ff; int n_threads; int n_batch; int n_examples; - int n_predict; + + float f_norm_rms_eps; + float rope_freq_base; + float rope_freq_scale; int print_info_interval; - int print_details_interval; bool samples_start_after_nl; bool use_adam; bool use_flash; - bool use_scratch; + bool use_checkpointing; + bool use_alloc; // only adam int warmup; int cos_decay_steps; float cos_decay_restart; - float cos_decay_alpha; + float cos_decay_min; + bool enable_restart; + + int opt_past; + float opt_delta; + int opt_max_no_improvement; int lbfgs_n_iter; int adam_n_iter; float adam_alpha; + float adam_min_alpha; float adam_decay; + int adam_decay_min_ndim; + float adam_beta1; + float adam_beta2; + float adam_gclip; + float adam_eps_f; int mem_model_gb; int mem_compute_gb; int mem_compute0_gb; - int mem_compute1_gb; }; struct train_params get_default_train_params() { @@ -2725,40 +1546,51 @@ struct train_params get_default_train_params() { params.n_ctx = 128; params.n_embd = 256; - params.n_mult = 256; params.n_head = 8; params.n_layer = 16; - params.n_rotmax = 64; + params.n_ff = 768; params.n_threads = 6; params.n_batch = 8; - params.n_examples = 8; - params.n_predict = 1024; + params.n_examples = 1; + + params.f_norm_rms_eps = 1e-5; + params.rope_freq_base = 10000.0f; + params.rope_freq_scale = 1.0f; params.print_info_interval = 1; - params.print_details_interval = 2; params.samples_start_after_nl = false; params.use_adam = true; params.use_flash = true; - params.use_scratch = true; + params.use_checkpointing = true; + params.use_alloc = true; + + params.opt_past = 0; + params.opt_delta = 1e-5f; + params.opt_max_no_improvement = 0; // only adam params.warmup = 100; params.cos_decay_steps = 1000; params.cos_decay_restart = 1.1f; - params.cos_decay_alpha = 0.0f; - - params.lbfgs_n_iter = 16; - params.adam_n_iter = 16; - params.adam_alpha = 1e-3f; - params.adam_decay = 1e-3f; - - params.mem_model_gb = 2; + params.cos_decay_min = 0.1f; + params.enable_restart = false; + + params.lbfgs_n_iter = 256; + params.adam_n_iter = 256; + params.adam_alpha = 1e-3f; + params.adam_min_alpha = 0; + params.adam_decay = 1e-1f; +
params.adam_decay_min_ndim = 2; + params.adam_beta1 = 0.9f; + params.adam_beta2 = 0.999f; + params.adam_gclip = 1.0f; + params.adam_eps_f = 0.0f; + + params.mem_model_gb = 2; params.mem_compute_gb = 24; params.mem_compute0_gb = 8; - params.mem_compute1_gb = 2; - return params; } @@ -2775,35 +1607,47 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); - fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult); + fprintf(stderr, " --ff N Feedforward size used for new models. (default %d)\n", params->n_ff); fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); fprintf(stderr, " --layer N Number of layers for new models (default %d)\n", params->n_layer); - fprintf(stderr, " --rotmax N Maximal number Rope dimensions for new models (default %d)\n", params->n_rotmax); + fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); + fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); + fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); - fprintf(stderr, " --predict N Number of tokens to generate after training (default %d)\n", params->n_predict); fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); - fprintf(stderr, " --print-details-interval N Print details during training each N examples (default %d)\n", params->print_details_interval); fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? 
"on" : "off"); fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); - fprintf(stderr, " --no-flash Don't use flash attention.\n"); + fprintf(stderr, " --no-flash Don't use flash attention \n"); fprintf(stderr, " --use-flash Use flash attention (default)\n"); - fprintf(stderr, " --no-scratch Don't use scratch buffers\n"); - fprintf(stderr, " --use-scratch Use scratch buffers (default)\n"); - fprintf(stderr, " --warmup N Number of warmup steps (default %d)\n", params->warmup); - fprintf(stderr, " --cos-decay-steps N Number of cosine decay steps (default %d)\n", params->cos_decay_steps); - fprintf(stderr, " --cos-decay-restart N Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); - fprintf(stderr, " --cos-decay-alpha N Cosine decay alpha (default %f)\n", params->cos_decay_alpha); - fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); + fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); + fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n"); + fprintf(stderr, " --no-alloc Don't use allocator\n"); + fprintf(stderr, " --use-alloc Use allocator (default)\n"); + fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); + fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); + fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); + fprintf(stderr, " --cos-decay-min N Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min); + fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past); + fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta); + fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement); + fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); + fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha); fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. 
(default %d)\n", params->adam_decay_min_ndim); + fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); + fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); + fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); + fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); - fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); - fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); + fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb); fprintf(stderr, "\n"); } @@ -2867,12 +1711,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_embd = std::stoi(argv[i]); - } else if (arg == "--mult") { + } else if (arg == "--ff") { if (++i >= argc) { invalid_param = true; break; } - params->n_mult = std::stoi(argv[i]); + params->n_ff = std::stoi(argv[i]); } else if (arg == "--head") { if (++i >= argc) { invalid_param = true; @@ -2885,48 +1729,48 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_layer = std::stoi(argv[i]); - } else if (arg == "--rotmax") { + } else if (arg == "--norm-rms-eps") { if (++i >= argc) { invalid_param = true; break; } - params->n_rotmax = std::stoi(argv[i]); - } else if (arg == "-t" || arg == "--threads") { + params->f_norm_rms_eps = std::stof(argv[i]); + } else if (arg == "--rope-freq-base") { if (++i >= argc) { invalid_param = true; break; } - params->n_threads = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch") { + params->rope_freq_base = std::stof(argv[i]); + } else if (arg == "--rope-freq-scale") { if (++i >= argc) { invalid_param = true; break; } - params->n_batch = std::stoi(argv[i]); - } else if (arg == "-n" || arg == "--examples") { + params->rope_freq_scale = std::stof(argv[i]); + } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; break; } - params->n_examples = std::stoi(argv[i]); - } else if (arg == "--predict") { + params->n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch") { if (++i >= argc) { invalid_param = true; break; } - params->n_predict = std::stoi(argv[i]); - } else if (arg == "--print-info-interval") { + params->n_batch = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--examples") { if (++i >= argc) { invalid_param = true; break; } - params->print_info_interval = std::stoi(argv[i]); - } else if (arg == "--print-details-interval") { + params->n_examples = std::stoi(argv[i]); + } else if (arg == "--print-info-interval") { if (++i >= argc) { invalid_param = true; break; } - params->print_details_interval = std::stoi(argv[i]); + params->print_info_interval = std::stoi(argv[i]); } else if (arg == "--samples-after-nl") { params->samples_start_after_nl = true; } else if (arg 
== "--use-lbfgs") { @@ -2937,10 +1781,14 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_flash = false; } else if (arg == "--use-flash") { params->use_flash = true; - } else if (arg == "--no-scratch") { - params->use_scratch = false; - } else if (arg == "--use-scratch") { - params->use_scratch = true; + } else if (arg == "--no-checkpointing") { + params->use_checkpointing = false; + } else if (arg == "--use-checkpointing") { + params->use_checkpointing = true; + } else if (arg == "--no-alloc") { + params->use_alloc = false; + } else if (arg == "--use-alloc") { + params->use_alloc = true; } else if (arg == "--warmup") { if (++i >= argc) { invalid_param = true; @@ -2959,18 +1807,40 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->cos_decay_restart = std::stof(argv[i]); - } else if (arg == "--cos-decay-alpha") { + } else if (arg == "--cos-decay-min") { if (++i >= argc) { invalid_param = true; break; } - params->cos_decay_alpha = std::stof(argv[i]); - } else if (arg == "--lbfgs-iter") { + params->cos_decay_min = std::stof(argv[i]); + } else if (arg == "--enable-restart") { + params->enable_restart = true; + } else if (arg == "--disable-restart") { + params->enable_restart = false; + } else if (arg == "--opt-past") { if (++i >= argc) { invalid_param = true; break; } - params->lbfgs_n_iter = std::stoi(argv[i]); + params->opt_past = std::stoi(argv[i]); + } else if (arg == "--opt-delta") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->opt_delta = std::stof(argv[i]); + } else if (arg == "--opt-max-no-improvement") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->opt_max_no_improvement = std::stoi(argv[i]); + } else if (arg == "--adam-epsf") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_eps_f = std::stof(argv[i]); } else if (arg == "--adam-iter") { if (++i >= argc) { invalid_param = true; @@ -2983,12 +1853,48 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->adam_alpha = std::stof(argv[i]); + } else if (arg == "--adam-min-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_min_alpha = std::stof(argv[i]); } else if (arg == "--adam-decay") { if (++i >= argc) { invalid_param = true; break; } params->adam_decay = std::stof(argv[i]); + } else if (arg == "--adam-decay-min-ndim") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_decay_min_ndim = std::stoi(argv[i]); + } else if (arg == "--adam-beta1") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_beta1 = std::stof(argv[i]); + } else if (arg == "--adam-beta2") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_beta2 = std::stof(argv[i]); + } else if (arg == "--adam-gclip") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_gclip = std::stof(argv[i]); + } else if (arg == "--lbfgs-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lbfgs_n_iter = std::stoi(argv[i]); } else if (arg == "--mem-model") { if (++i >= argc) { invalid_param = true; @@ -3007,12 +1913,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->mem_compute0_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute1") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute1_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { 
train_print_usage(argc, argv, &default_params); exit(0); @@ -3031,6 +1931,63 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { return true; } +struct opt_callback_data { + struct train_params * params; + struct ggml_opt_context * opt; + struct llama_context * lctx; + llama_token * tokens_data; + size_t tokens_size; + int * samples_data; + size_t samples_size; + int shuffle_countdown; + struct ggml_tensor * tokens_input; + struct ggml_tensor * target_logits; + struct ggml_tensor * target_probs; +}; + +void opt_callback(void * vdata, float * sched) { + struct opt_callback_data * data = (struct opt_callback_data *) vdata; + struct train_params * params = data->params; + struct ggml_opt_context * opt = data->opt; + int n_batch = params->n_batch; + + *sched = (opt->iter < params->warmup) + ? (float) opt->iter / (float) params->warmup + : cosine_decay_restart( + params->cos_decay_steps, + params->cos_decay_min, + opt->iter - params->warmup, + params->cos_decay_restart, + params->enable_restart); + float min_sched = params->adam_min_alpha / params->adam_alpha; + *sched = min_sched + *sched * (1.0f - min_sched); + + int impr_plot = std::isnan(opt->loss_after) ? 0 : -std::lround(1 + (opt->loss_before - opt->loss_after) * 10.0f); + printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); + + if (data->shuffle_countdown < n_batch) { + printf("%s: reshuffle samples\n", __func__); + shuffle_ints(data->samples_data, data->samples_data + data->samples_size); + for (int i = 0; i < (int) data->samples_size; ++i) { + GGML_ASSERT(data->samples_data[i]+params->n_ctx-1 < (int) data->tokens_size); + } + data->shuffle_countdown = data->samples_size; + } + + get_example_targets_batch( + data->lctx, + data->samples_data, + data->samples_size, + data->tokens_data, + data->tokens_size, + opt->iter, + data->tokens_input, + data->target_logits, + data->target_probs); + + data->shuffle_countdown -= n_batch; +} + int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); @@ -3050,25 +2007,6 @@ int main(int argc, char ** argv) { struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); - struct llama_vocab vocab; - { - std::vector<const char *> strings; - std::vector<float> scores; - int n_vocab = llama_n_vocab(lctx); - strings.resize(n_vocab, NULL); - scores.resize(n_vocab, 0); - n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); - GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); - vocab.id_to_token.resize(n_vocab); - for (int i=0; i<n_vocab; ++i) { - std::string tok = std::string(strings[i]); - float score = scores[i]; - vocab.id_to_token[i].tok = tok; - vocab.id_to_token[i].score = score; - vocab.token_to_id.emplace(tok, i); - } - } std::vector<llama_token> train_tokens; if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { @@ -3080,10 +2018,14 @@ int main(int argc, char ** argv) { model.hparams.n_vocab = llama_n_vocab(lctx); model.hparams.n_ctx = params.n_ctx; model.hparams.n_embd = params.n_embd; - model.hparams.n_mult = params.n_mult; model.hparams.n_head = params.n_head; model.hparams.n_layer = params.n_layer; - model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); + model.hparams.n_ff = params.n_ff; + // llama.cpp requires n_rot to be exactly n_embd / n_head + model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head; + model.hparams.f_norm_rms_eps = params.f_norm_rms_eps; + model.hparams.rope_freq_base = params.rope_freq_base; + model.hparams.rope_freq_scale = params.rope_freq_scale;
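
The learning-rate schedule wired into opt_callback above is applied in two stages: a linear warmup from 0 to 1 over the first `warmup` iterations, then a cosine decay toward cos_decay_min (with optional restarts whose period is stretched by cos_decay_restart), and finally a rescale so the multiplier never drops below adam_min_alpha/adam_alpha. A minimal standalone sketch of that logic follows; the constants in main() mirror the defaults in get_default_train_params, except adam_min_alpha, which defaults to 0 in the patch and is set nonzero here only so the floor is visible.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // decay from 1.0 down to `minimum` over `decay_steps`, following half a cosine wave
    static float cosine_decay(int decay_steps, float minimum, int step) {
        step = std::min(step, decay_steps);
        const float decay = 0.5f*(1.0f + std::cos(3.14159265359f*step/decay_steps));
        return (1.0f - minimum)*decay + minimum;
    }

    // optionally restart the decay, stretching each period by restart_step_mult
    static float cosine_decay_restart(int decay_steps, float minimum, int step, float restart_step_mult, bool enable_restart) {
        if (enable_restart) {
            while (step > decay_steps) {
                step       -= decay_steps;
                decay_steps = (int) (restart_step_mult * decay_steps);
            }
        }
        return cosine_decay(decay_steps, minimum, step);
    }

    int main() {
        const int   warmup         = 100;    // --warmup
        const int   decay_steps    = 1000;   // --cos-decay-steps
        const float decay_min      = 0.1f;   // --cos-decay-min
        const float restart_mult   = 1.1f;   // --cos-decay-restart
        const bool  enable_restart = false;  // --enable-restart / --disable-restart
        const float adam_alpha     = 1e-3f;  // --adam-alpha
        const float adam_min_alpha = 1e-4f;  // --adam-min-alpha (illustrative; the patch defaults to 0)

        for (int iter = 0; iter <= 1200; iter += 100) {
            // stage 1: linear warmup, then cosine decay - same branch as opt_callback
            float sched = (iter < warmup)
                ? (float) iter / (float) warmup
                : cosine_decay_restart(decay_steps, decay_min, iter - warmup, restart_mult, enable_restart);
            // stage 2: rescale so the multiplier never falls below adam_min_alpha/adam_alpha
            const float min_sched = adam_min_alpha / adam_alpha;
            sched = min_sched + sched*(1.0f - min_sched);
            printf("iter=%4d sched=%.4f effective lr=%g\n", iter, sched, adam_alpha*sched);
        }
        return 0;
    }

The two-stage form keeps the warmup and decay shapes independent of the floor: adam_min_alpha only compresses the whole curve into [adam_min_alpha/adam_alpha, 1] rather than clipping it.
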
print_params(&model.hparams); @@ -3105,19 +2047,12 @@ int main(int argc, char ** argv) { } printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); - struct my_llama_kv_cache kv_self; - - struct ggml_init_params lcparams; lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); lcparams.mem_buffer = NULL; lcparams.no_alloc = false; model.ctx = ggml_init(lcparams); - kv_self.ctx = model.ctx; - - my_llama_sampler sampler; - int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; @@ -3128,24 +2063,38 @@ int main(int argc, char ** argv) { struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); - opt_params_adam.print_forward_graph = false; + opt_params_adam.print_forward_graph = false; opt_params_adam.print_backward_graph = false; - opt_params_adam.n_threads = params.n_threads; - opt_params_adam.adam.n_iter = params.adam_n_iter; - opt_params_adam.adam.sched = 1.0f; - opt_params_adam.adam.alpha = params.adam_alpha; - opt_params_adam.adam.decay = params.adam_decay; - - opt_params_lbfgs.print_forward_graph = false; + opt_params_adam.n_threads = params.n_threads; + opt_params_adam.past = params.opt_past; + opt_params_adam.delta = params.opt_delta; + opt_params_adam.max_no_improvement = params.opt_max_no_improvement; + opt_params_adam.adam.n_iter = params.adam_n_iter; + opt_params_adam.adam.sched = 1.0f; + opt_params_adam.adam.alpha = params.adam_alpha; + opt_params_adam.adam.decay = params.adam_decay; + opt_params_adam.adam.decay_min_ndim = params.adam_decay_min_ndim; + opt_params_adam.adam.beta1 = params.adam_beta1; + opt_params_adam.adam.beta2 = params.adam_beta2; + opt_params_adam.adam.gclip = params.adam_gclip; + opt_params_adam.adam.eps_f = params.adam_eps_f; + + opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = params.n_threads; - opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; + opt_params_lbfgs.n_threads = params.n_threads; + opt_params_lbfgs.past = params.opt_past; + opt_params_lbfgs.delta = params.opt_delta; + opt_params_lbfgs.max_no_improvement = params.opt_max_no_improvement; + opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; opt->ctx = model.ctx; opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; printf("%s: init model\n", __func__); - bool existed = load_checkpoint(&model, opt, params.fn_checkpoint_in, true); + bool existed = load_checkpoint_file(params.fn_checkpoint_in, &model, opt); + if (!existed) { + init_model(&model); + } set_param_model(&model); opt->params = params.use_adam ?
opt_params_adam : opt_params_lbfgs; @@ -3158,11 +2107,7 @@ int main(int argc, char ** argv) { randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); } - init_kv_cache(&kv_self, &model, 1); - // init_kv_cache(&kv_self, &model, n_batch); - init_sampler(&sampler, lctx); - - printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx)); + printf("used_mem model: %zu bytes\n", ggml_used_mem(model.ctx)); // ggml_print_tensor_objects(model.ctx); // TODO: use std::vector instead of "new" @@ -3170,15 +2115,19 @@ uint8_t * compute_addr = new uint8_t[compute_size]; size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); - size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; - uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; + + ggml_allocr * alloc = NULL; + if (params.use_alloc) { + static const size_t tensor_alignment = 32; + alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); + } GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector<int> train_samples; train_samples.push_back(0); for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) { - if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl())) { + if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) { train_samples.push_back(i); } } @@ -3187,10 +2136,23 @@ GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); } - std::vector<uint8_t> work_buffer; - printf("%s: begin training\n", __func__); + struct opt_callback_data opt_cb_data; + opt_cb_data.params = &params; + opt_cb_data.opt = opt; + opt_cb_data.lctx = lctx; + opt_cb_data.tokens_data = train_tokens.data(); + opt_cb_data.tokens_size = train_tokens.size(); + opt_cb_data.samples_data = train_samples.data(); + opt_cb_data.samples_size = train_samples.size(); + opt_cb_data.shuffle_countdown = train_samples.size(); + opt_cb_data.tokens_input = NULL; + opt_cb_data.target_logits = NULL; + opt_cb_data.target_probs = NULL; + + int64_t t0 = ggml_time_ms(); + for (int ex = 0; ex < params.n_examples; ++ex) { if (ex*n_batch >= (int) train_samples.size()) { shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); @@ -3200,198 +2162,110 @@ } struct ggml_init_params cparams = { - /*.mem_size =*/ compute_size, - /*.mem_buffer =*/ compute_addr, - /*.no_alloc =*/ false, + compute_size, // mem_size + compute_addr, // mem_buffer + false, // no_alloc }; struct ggml_context * ctx0 = ggml_init(cparams); - struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); + ggml_set_no_alloc(ctx0, false); + + // don't use alloc for input tensors, so we can safely fill them with data + //struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); //struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - int n_past = 0; - - struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct
ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); - struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + ggml_set_no_alloc(ctx0, (alloc != NULL)); - memset(gfbuf->data, 0, ggml_nbytes(gfbuf)); - memset(gbbuf->data, 0, ggml_nbytes(gbbuf)); + if (alloc) { + ggml_allocr_reset(alloc); + } - struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; - struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; + opt_cb_data.tokens_input = tokens_input; + opt_cb_data.target_logits = target_logits; + opt_cb_data.target_probs = target_probs; + int n_past = 0; - get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gb = ggml_new_graph(ctx0); + struct ggml_cgraph * gb_tmp = params.use_checkpointing + ? ggml_new_graph(ctx0) + : NULL; GGML_ASSERT(n_past == 0); struct ggml_tensor * loss = NULL; struct ggml_tensor * logits = NULL; - if (params.use_scratch) { - loss = forward_batch_wo_cache_flash_attn_train( - &model, ctx0, - gf, gb, - &logits, tokens_input, target_probs, - compute_buf_0, compute_buf_1, - size_buf_0, size_buf_1, - n_tokens, n_batch); - } else if (params.use_flash) { - logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch); - loss = cross_entropy_loss(ctx0, logits, target_probs); - ggml_build_forward_expand(gf, loss); - *gb = ggml_build_backward(ctx0, gf, true); - } else { - logits = forward_batch_wo_cache(&model, ctx0, gf, tokens_input, n_tokens, n_batch); - loss = cross_entropy_loss(ctx0, logits, target_probs); - ggml_build_forward_expand(gf, loss); - *gb = ggml_build_backward(ctx0, gf, true); - } - - ggml_graph_compute_helper(work_buffer, gf, params.n_threads); + loss = llama_build_train_graphs( + &model, alloc, ctx0, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.use_flash, + params.use_checkpointing + ); size_t used_mem_before_opt = ggml_used_mem(ctx0); - float error_before_opt = ggml_get_f32_1d(loss, 0); - opt->params.adam.sched = (opt->iter < params.warmup) ? (float) opt->iter / (float) params.warmup : cosine_decay_restart( params.cos_decay_steps, - params.cos_decay_alpha, + params.cos_decay_min, opt->iter - params.warmup, - params.cos_decay_restart); + params.cos_decay_restart, + params.enable_restart); + + float min_sched = params.adam_min_alpha / params.adam_alpha; + opt->params.adam.sched = min_sched + opt->params.adam.sched * (1.0f - min_sched); printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); - ggml_opt_resume_g(ctx0, opt, loss, gf, gb); + ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data); size_t used_mem_after_opt = ggml_used_mem(ctx0); + int n_iter = params.use_adam ? 
params.adam_n_iter : params.lbfgs_n_iter; model.train_its = opt->iter; - model.train_samples += n_batch; - model.train_tokens += n_batch * n_tokens; - - ggml_graph_compute_helper(work_buffer, gf, params.n_threads); - - float error_after_opt = ggml_get_f32_1d(loss, 0); + model.train_samples += n_batch * n_iter; + model.train_tokens += n_batch * n_tokens * n_iter; if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); - printf("error_before_opt: %.6f\n", error_before_opt); - printf("error_after_opt: %.6f\n", error_after_opt); + printf("error_before_opt: %.6f\n", opt->loss_before); + printf("error_after_opt: %.6f\n", opt->loss_after); printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); } - if (params.print_details_interval > 0 && ex % params.print_details_interval == 0) { - // set_logits_masked(logits, token_notavail, -1e9); - for (int i=0; i<n_batch; ++i) { - init_sampler(&sampler, lctx); - for (int k=0; k<n_tokens; ++k) { - int32_t token = sample(&sampler, - (float *) ((char *) logits->data + i*logits->nb[2] + k*logits->nb[1]), - (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]), - k); - * ((int32_t *) ((char *) after_opt_best_samples->data + i*after_opt_best_samples->nb[1] + k*after_opt_best_samples->nb[0])) = token; - } - } - - // printf("probabilities after optimization:\n"); - // print_matrix(after_opt_probs); - printf("Example:\n---\n"); - print_tokens_batch(lctx, tokens_input); - printf("\n---\n"); - - // printf("best samples after optimization:\n---\n"); - printf("samples after optimization:\n---\n"); - print_tokens_batch(lctx, after_opt_best_samples); - printf("\n---\n"); - } - ggml_free(ctx0); } + int64_t t1 = ggml_time_ms(); + int64_t d = t1-t0; + double dd = (double) d * 1e-3; + printf("%s: total training time=%f seconds\n", __func__, dd); + if (params.n_examples > 0) { - save_checkpoint(&model, opt, params.fn_checkpoint_out); + save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt); } if (strlen(params.fn_model_out) > 0) { - save_as_llama_model(&vocab, &model, params.fn_model_out); + save_llama_model_file(params.fn_model_out, params.fn_vocab_model, &model); } - { - int n_gen = params.n_predict; - int sample_ctx = n_tokens - n_tokens/8; - - sampler.params.temp = 0.2f; - sampler.params.repeat_penalty = 1.1f; - sampler.params.mirostat = 2; - init_sampler(&sampler, lctx); - - printf("Generating %d tokens.\n", n_gen); - - struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens); - struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); - struct ggml_tensor * target_probs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); - - get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs); - for (int i=sample_ctx; i<n_tokens; ++i) { - ggml_set_i32_1d(tokens_input, i, n_vocab/2); - } - - for (int i=0; i<n_gen; ++i) { - struct ggml_init_params cparams = { - /*.mem_size =*/ compute_size, - /*.mem_buffer =*/ compute_addr, - /*.no_alloc =*/ false, - }; - struct ggml_context * ctx0 = ggml_init(cparams); - - ggml_cgraph gf = {}; - - int n_past = 0; - struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); - - ggml_build_forward_expand(&gf, logits); - ggml_graph_compute_helper(work_buffer, &gf, params.n_threads); - - int token = sample(&sampler, - (float *) ((char *) logits->data + (sample_ctx-1)*logits->nb[1]), - (llama_token *) tokens_input->data, - sample_ctx-1); - //int token = ggml_get_i32_1d(best_samples, sample_ctx-1); - - // print_row(probs, sample_at); - print_token(lctx, token); - - lshift_examples(tokens_input, target_logits, target_probs, 1); - ggml_set_i32_1d(tokens_input, 0, 0); - ggml_set_i32_1d(tokens_input, sample_ctx-1, token); - - ggml_free(ctx0); - } + if (alloc) { + ggml_allocr_free(alloc); } delete[] compute_addr; delete[] compute_buf_0; - delete[] compute_buf_1; - + ggml_free(model.ctx); llama_free(lctx); llama_free_model(lmodel); - ggml_free(model.ctx); - return 0;
} diff --git a/expose.cpp b/expose.cpp index febfcba457633b1ae03cd6e113409a7623dccaf1..d385ffcb7b22176cc969adf5cda0725ec4269c3c 100644 --- a/expose.cpp +++ b/expose.cpp @@ -27,6 +27,7 @@ extern "C" //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt) static FileFormat file_format = FileFormat::BADFORMAT; + static FileFormatExtraMeta file_format_meta; bool load_model(const load_model_inputs inputs) { @@ -36,11 +37,9 @@ extern "C" int forceversion = inputs.forceversion; - if(forceversion==0) - { - file_format = check_file_format(model.c_str()); - } - else + file_format = check_file_format(model.c_str(),&file_format_meta); + + if(forceversion!=0) { printf("\nWARNING: FILE FORMAT FORCED TO VER %d\nIf incorrect, loading may fail or crash.\n",forceversion); file_format = (FileFormat)forceversion; @@ -64,7 +63,7 @@ extern "C" if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4 || file_format==FileFormat::GPTJ_5) { printf("\n---\nIdentified as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format); - ModelLoadResult lr = gpttype_load_model(inputs, file_format); + ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta); if (lr == ModelLoadResult::RETRY_LOAD) { if(file_format==FileFormat::GPTJ_1) @@ -73,14 +72,14 @@ extern "C" //otherwise if we tried 3 first, then try 2 file_format = FileFormat::GPTJ_4; printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format); - lr = gpttype_load_model(inputs, file_format); + lr = gpttype_load_model(inputs, file_format, file_format_meta); } if (lr == ModelLoadResult::RETRY_LOAD) { file_format = FileFormat::GPTJ_3; printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format); - lr = gpttype_load_model(inputs, file_format); + lr = gpttype_load_model(inputs, file_format, file_format_meta); } //lastly try format 2 @@ -88,7 +87,7 @@ extern "C" { file_format = FileFormat::GPTJ_2; printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format); - lr = gpttype_load_model(inputs, file_format); + lr = gpttype_load_model(inputs, file_format, file_format_meta); } } @@ -104,32 +103,19 @@ extern "C" else if(file_format==FileFormat::GPT2_1||file_format==FileFormat::GPT2_2||file_format==FileFormat::GPT2_3||file_format==FileFormat::GPT2_4) { printf("\n---\nIdentified as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format); - ModelLoadResult lr = gpttype_load_model(inputs, file_format); + ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta); if (lr == ModelLoadResult::RETRY_LOAD) { file_format = FileFormat::GPT2_3; printf("\n---\nRetrying as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format); - lr = gpttype_load_model(inputs, file_format); + lr = gpttype_load_model(inputs, file_format, file_format_meta); } if (lr == ModelLoadResult::RETRY_LOAD) { file_format = FileFormat::GPT2_2; printf("\n---\nRetrying as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format); - lr = gpttype_load_model(inputs, file_format); - } - if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD) - { - return false; + lr = gpttype_load_model(inputs, file_format, file_format_meta); } - else - { - return true; - } - } - else if(file_format==FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2) - { - printf("\n---\nIdentified as RWKV model: (ver %d)\nAttempting to Load...\n---\n", file_format); - 
ModelLoadResult lr = gpttype_load_model(inputs, file_format); if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD) { return false; @@ -142,27 +128,27 @@ extern "C" else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5 || file_format==FileFormat::NEOX_6 || file_format==FileFormat::NEOX_7) { printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format); - ModelLoadResult lr = gpttype_load_model(inputs, file_format); + ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta); if (lr == ModelLoadResult::RETRY_LOAD) { if(file_format==FileFormat::NEOX_2) { file_format = FileFormat::NEOX_3; printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format); - lr = gpttype_load_model(inputs, file_format); + lr = gpttype_load_model(inputs, file_format, file_format_meta); } else { file_format = FileFormat::NEOX_5; printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format); - lr = gpttype_load_model(inputs, file_format); + lr = gpttype_load_model(inputs, file_format, file_format_meta); } } if (lr == ModelLoadResult::RETRY_LOAD) { file_format = FileFormat::NEOX_1; printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format); - lr = gpttype_load_model(inputs, file_format); + lr = gpttype_load_model(inputs, file_format, file_format_meta); } if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD) { @@ -173,23 +159,25 @@ extern "C" return true; } } - else if(file_format==FileFormat::MPT_1) + else { - printf("\n---\nIdentified as MPT model: (ver %d)\nAttempting to Load...\n---\n", file_format); - ModelLoadResult lr = gpttype_load_model(inputs, file_format); - if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD) + if(file_format==FileFormat::MPT_1) { - return false; + printf("\n---\nIdentified as MPT model: (ver %d)\nAttempting to Load...\n---\n", file_format); + } + else if(file_format==FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2) + { + printf("\n---\nIdentified as RWKV model: (ver %d)\nAttempting to Load...\n---\n", file_format); + } + else if(file_format==FileFormat::GGUF_FALCON) + { + printf("\n---\nIdentified as FALCON model: (ver %d)\nAttempting to Load...\n---\n", file_format); } else { - return true; + printf("\n---\nIdentified as LLAMA model: (ver %d)\nAttempting to Load...\n---\n", file_format); } - } - else - { - printf("\n---\nIdentified as LLAMA model: (ver %d)\nAttempting to Load...\n---\n", file_format); - ModelLoadResult lr = gpttype_load_model(inputs, file_format); + ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta); if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD) { return false; @@ -240,4 +228,10 @@ extern "C" bool abort_generate() { return gpttype_generate_abort(); } + + int token_count(const char * input) + { + std::string inputstr = input; + return gpttype_token_count(inputstr); + } } diff --git a/expose.h b/expose.h index 0905bc3ec8bb9a887f75f1b07302c1c298c60059..535e1137416951748ed12b2f76024be8d03850bb 100644 --- a/expose.h +++ b/expose.h @@ -69,8 +69,10 @@ struct generation_inputs const float mirostat_tau; const samplers sampler_order[KCPP_SAMPLER_MAX]; const int sampler_len; + const bool unban_tokens_rt; const char * stop_sequence[stop_token_max]; const bool stream_sse; + const char * 
grammar; }; struct generation_outputs { diff --git a/ggml-alloc.c b/ggml-alloc.c index 129af8a97ca6a488ecf78c8dbdb98c32a75b9f54..ed9f19cfdc5de8c7fa1df193df2ff26dc1cea690 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -6,8 +6,29 @@ #include #include +#ifdef __has_include + #if __has_include() + #include + #if defined(_POSIX_MAPPED_FILES) + #include + #include + #endif + #endif +#endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #ifndef NOMINMAX + #define NOMINMAX + #endif + #include + #include +#endif + + #define UNUSED(x) (void)(x) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define GGML_MAX_CONCUR (2*GGML_MAX_NODES) //#define GGML_ALLOCATOR_DEBUG @@ -67,6 +88,8 @@ struct ggml_allocr { struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE]; size_t max_size; bool measure; + int parse_seq[GGML_MAX_CONCUR]; + int parse_seq_len; #ifdef GGML_ALLOCATOR_DEBUG struct ggml_tensor * allocated_tensors[1024]; @@ -74,7 +97,7 @@ struct ggml_allocr { }; #ifdef GGML_ALLOCATOR_DEBUG -static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { +static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { if (alloc->allocated_tensors[i] == NULL) { alloc->allocated_tensors[i] = tensor; @@ -83,7 +106,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens } GGML_ASSERT(!"out of allocated_tensors"); } -static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { +static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { if (alloc->allocated_tensors[i] == tensor || (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { @@ -96,25 +119,38 @@ static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_t } #endif - -static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { +static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { return ggml_nbytes(tensor); UNUSED(alloc); } +// check if a tensor is allocated by this buffer +static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) { + void * ptr = tensor->data; + return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size; +} + +static bool ggml_is_view(struct ggml_tensor * t) { + return t->view_src != NULL; +} + void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { - size_t size = ggml_allocator_get_alloc_size(alloc, tensor); +#ifdef GGML_ALLOCATOR_DEBUG + GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources + GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated +#endif + size_t size = ggml_allocr_get_alloc_size(alloc, tensor); size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); size_t max_avail = 0; - // find the best fitting free block + // find the best fitting free block besides the last block int best_fit_block = -1; size_t best_fit_size = SIZE_MAX; - for (int i = 0; i < alloc->n_free_blocks; i++) { + for (int i = 0; i < alloc->n_free_blocks - 1; i++) { struct free_block * block = &alloc->free_blocks[i]; max_avail = MAX(max_avail, block->size); if (block->size >= size && block->size <= best_fit_size) { @@ -126,10 +162,17 @@ 
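ggml_allocr_is_own above classifies a tensor by whether its data pointer falls inside the allocator's buffer range, which is how frees of foreign tensors (e.g. weights living in another buffer) get silently ignored. The check, as plain C++ with illustrative names:

#include <cstddef>

// Does ptr lie within [base, base + size)? Used to ignore frees of
// tensors that belong to some other buffer.
static bool owns(const void * base, size_t size, const void * ptr) {
    const char * b = (const char *) base;
    const char * p = (const char *) ptr;
    return p >= b && p < b + size;
}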
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) AT_PRINTF("block %d\n", best_fit_block); if (best_fit_block == -1) { - fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", - __func__, size, max_avail); - GGML_ASSERT(!"not enough space in the buffer"); - return; + // the last block is our last resort + struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size) { + best_fit_block = alloc->n_free_blocks - 1; + } else { + fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", + __func__, size, max_avail); + GGML_ASSERT(!"not enough space in the buffer"); + return; + } } struct free_block * block = &alloc->free_blocks[best_fit_block]; void * addr = block->addr; @@ -163,17 +206,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) } // this is a very naive implementation, but for our case the number of free blocks should be very small -static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { +static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { void * ptr = tensor->data; - if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) { + if (ggml_allocr_is_own(alloc, tensor) == false) { // the tensor was not allocated in this buffer // this can happen because the graph allocator will try to free weights and other tensors from different buffers // the easiest way to deal with this is just to ignore it return; } - size_t size = ggml_allocator_get_alloc_size(alloc, tensor); + size_t size = ggml_allocr_get_alloc_size(alloc, tensor); size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks); @@ -229,6 +272,13 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t alloc->n_free_blocks++; } +void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) { + for (int i = 0; i < n; i++) { + alloc->parse_seq[i] = list[i]; + } + alloc->parse_seq_len = n; +} + void ggml_allocr_reset(struct ggml_allocr * alloc) { alloc->n_free_blocks = 1; size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); @@ -248,8 +298,10 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) /*.hash_table = */ {{0}}, /*.max_size = */ 0, /*.measure = */ false, + /*.parse_seq = */ {0}, + /*.parse_seq_len = */ 0, #ifdef GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ = {0}, + /*.allocated_tensors = */ {0}, #endif }; @@ -258,25 +310,78 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) return alloc; } -// address and size of the buffer when measuring -// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers -static void * const MEASURE_BASE_ADDR = (void *) 0x1000; -static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB +// OS specific functions to allocate and free uncommitted virtual memory +static void * alloc_vmem(size_t size) { +#if defined(_WIN32) + return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS); +#elif defined(_POSIX_MAPPED_FILES) + void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (ptr == MAP_FAILED) { + return NULL; + } + return ptr; +#else + // use a fixed address for other 
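The allocation change above stops treating the final free block like any other: the best-fit search covers every block except the last, and the last is used only when nothing else fits. A compact sketch of that selection policy (illustrative names, not the ggml API):

#include <cstddef>

struct FreeBlock { char * addr; size_t size; };

// Best fit over all blocks except the last; the last block is the
// fallback of last resort when no earlier block is large enough.
static int pick_block(const FreeBlock * blocks, int n, size_t size) {
    int    best      = -1;
    size_t best_size = (size_t) -1;
    for (int i = 0; i < n - 1; i++) {
        if (blocks[i].size >= size && blocks[i].size < best_size) {
            best      = i;
            best_size = blocks[i].size;
        }
    }
    if (best == -1 && n > 0 && blocks[n - 1].size >= size) {
        best = n - 1; // last resort: the big block at the buffer's tail
    }
    return best; // -1 means out of memory
}

Keeping the tail block out of the regular search packs small tensors into earlier fragments first, so the large block at the end of the buffer stays available for big allocations.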
platforms + uintptr_t base_addr = (uintptr_t)-size - 0x100; + return (void *)base_addr; +#endif +} + +static void free_vmem(void * base_addr, size_t size) { +#if defined(_WIN32) + VirtualFree(base_addr, 0, MEM_RELEASE); + UNUSED(size); +#elif defined(_POSIX_MAPPED_FILES) + munmap(base_addr, size); +#else + // nothing to do + UNUSED(base_addr); + UNUSED(size); +#endif +} + +// allocate uncommitted virtual memory to measure the size of the graph +static void alloc_measure_vmem(void ** base_addr, size_t * size) { + // 128GB for 64-bit, 1GB for 32-bit + *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37; + do { + *base_addr = alloc_vmem(*size); + if (*base_addr != NULL) { + AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr); + return; + } + // try again with half the size + *size /= 2; + } while (*size > 0); + + GGML_ASSERT(!"failed to allocate virtual memory for measure buffer"); +} + +static void free_measure_vmem(void * base_addr, size_t size) { + free_vmem(base_addr, size); +} struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); + void * base_addr; + size_t size; + + alloc_measure_vmem(&base_addr, &size); + *alloc = (struct ggml_allocr){ - /*.data = */ MEASURE_BASE_ADDR, - /*.size = */ MEASURE_MAX_SIZE, + /*.data = */ base_addr, + /*.size = */ size, /*.alignment = */ alignment, /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, /*.hash_table = */ {{0}}, /*.max_size = */ 0, /*.measure = */ true, + /*.parse_seq = */ {0}, + /*.parse_seq_len = */ 0, #ifdef GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ = {0}, + /*.allocated_tensors = */ {0}, #endif }; @@ -286,6 +391,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { } void ggml_allocr_free(struct ggml_allocr * alloc) { + if (alloc->measure) { + free_measure_vmem(alloc->data, alloc->size); + } free(alloc); } @@ -295,11 +403,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) { //////////// compute graph allocator -static bool ggml_is_view(struct ggml_tensor * t) { - return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || - t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; -} - static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { if (a->type != b->type) { return false; @@ -315,28 +418,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml return true; } -static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { - switch (t->op) { - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - return t->src[0]; - case GGML_OP_CPY: - return t->src[1]; - default: - return NULL; - } -} - -static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { - struct ggml_tensor * parent = t; - do { - parent = get_view_parent(parent); - } while (ggml_is_view(parent)); - return parent; -} - static bool ggml_op_can_inplace(enum ggml_op op) { switch (op) { case GGML_OP_SCALE: @@ -344,7 +425,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) { case GGML_OP_DIAG_MASK_INF: case GGML_OP_ADD: case GGML_OP_ADD1: - case GGML_OP_ACC: case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: @@ -354,7 +434,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) { case GGML_OP_UNARY: case GGML_OP_ROPE: case GGML_OP_RMS_NORM: - case GGML_OP_SET: case GGML_OP_SOFT_MAX: case 
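Rather than pretending to allocate at a fixed fake address, the measure allocator above now reserves real but uncommitted virtual address space, and halves the request until the OS grants it. A POSIX-only sketch of the same idea (the diff additionally handles Windows through VirtualAlloc and keeps a fixed-address fallback for platforms with neither):

#include <sys/mman.h>
#include <cstddef>

// Reserve address space without committing physical memory.
static void * reserve_vmem(size_t size) {
    void * p = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
    return p == MAP_FAILED ? NULL : p;
}

// Ask for a large range, halving on failure, as alloc_measure_vmem does.
static void * reserve_measure_buffer(size_t * size) {
    *size = sizeof(void *) == 4 ? 1ULL << 30 : 1ULL << 37; // 1 GB / 128 GB
    while (*size > 0) {
        void * base = reserve_vmem(*size);
        if (base != NULL) {
            return base; // released later with munmap(base, *size)
        }
        *size /= 2; // try again with half the size
    }
    return NULL;
}

Because the pages are PROT_NONE and never touched, the reservation costs address space only: the measure pass just needs distinct, ordered pointers to compute worst-case buffer sizes.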
GGML_OP_CONT: return true; @@ -368,24 +447,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) struct hash_node * ht = alloc->hash_table; if (node->data == NULL) { if (ggml_is_view(node)) { - size_t offset; - switch(node->op) { - case GGML_OP_VIEW: - memcpy(&offset, node->op_params, sizeof(size_t)); - node->data = (char *) node->src[0]->data + offset; - break; - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - node->data = node->src[0]->data; - break; - case GGML_OP_CPY: - node->data = node->src[1]->data; - break; - default: - GGML_ASSERT(!"unknown view op"); - break; - } + assert(node->view_src->data != NULL); + node->data = (char *)node->view_src->data + node->view_offs; } else { // see if we can reuse a parent's buffer (inplace) if (ggml_op_can_inplace(node->op)) { @@ -394,10 +457,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) if (parent == NULL) { break; } + + // if the node's data is external, then we cannot re-use it + if (ggml_allocr_is_own(alloc, parent) == false) { + AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); + continue; + } + struct hash_node * p_hn = hash_get(ht, parent); if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = get_view_source(parent); + struct ggml_tensor * view_src = parent->view_src; struct hash_node * view_src_hn = hash_get(ht, view_src); if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite @@ -413,8 +483,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); node->data = parent->data; + return; } - return; } } } @@ -423,7 +493,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) } } -static size_t ggml_allocator_alloc_graph_tensors_n( +static size_t ggml_allocr_alloc_graph_tensors_n( struct ggml_allocr * alloc, struct ggml_cgraph ** graphs, int n_graphs, struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { @@ -439,7 +509,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n( struct ggml_tensor * node = gf->nodes[i]; if (ggml_is_view(node)) { - struct ggml_tensor * view_src = get_view_source(node); + struct ggml_tensor * view_src = node->view_src; hash_get(ht, view_src)->n_views += 1; } @@ -465,70 +535,92 @@ static size_t ggml_allocator_alloc_graph_tensors_n( allocate_node(alloc, input); } } - for (int i = 0; i < gf->n_nodes; i++) { - struct ggml_tensor * node = gf->nodes[i]; - - // allocate parents (leafs) - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; + // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers + int last_barrier_pos = 0; + int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes; + + for (int ind = 0; ind < n_nodes; ind++) { + // allocate a node if there is no parse_seq or this is not a barrier + if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) { + int i = alloc->parse_seq_len ? 
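ggml_allocr_set_parse_seq hands the allocator an explicit node order in which -1 entries act as barriers: nodes are allocated in list order, but parent bookkeeping (and therefore freeing) is deferred until the next barrier. A small sketch of consuming such a list, with printf standing in for the real allocate and free calls:

#include <cstdio>
#include <vector>

// Walk a parse sequence where -1 marks a barrier; allocate eagerly,
// release parents only once a barrier is reached.
static void walk_parse_seq(const std::vector<int> & seq) {
    int last_barrier = 0;
    for (int ind = 0; ind < (int) seq.size(); ind++) {
        if (seq[ind] != -1) {
            std::printf("allocate node %d\n", seq[ind]); // ~ allocate_node()
        } else {
            for (int i = last_barrier; i < ind; i++) {   // flush deferred frees
                std::printf("release parents of node %d\n", seq[i]);
            }
            last_barrier = ind + 1;
        }
    }
}

Deferring frees to barriers matches backends that execute nodes concurrently between synchronization points: a parent's buffer must stay alive until every concurrent consumer of it has run.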
alloc->parse_seq[ind] : ind; + struct ggml_tensor * node = gf->nodes[i]; + + // allocate parents (leafs) + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + allocate_node(alloc, parent); } - allocate_node(alloc, parent); - } - // allocate node - allocate_node(alloc, node); + // allocate node + allocate_node(alloc, node); - AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - AT_PRINTF("%s", parent->name); - if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { - AT_PRINTF(", "); + AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + AT_PRINTF("%s", parent->name); + if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); + } } + AT_PRINTF("\n"); } - AT_PRINTF("\n"); // update parents - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - struct hash_node * p_hn = hash_get(ht, parent); - p_hn->n_children -= 1; - - //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); - - if (p_hn->n_children == 0 && p_hn->n_views == 0) { - if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = get_view_source(parent); - struct hash_node * view_src_hn = hash_get(ht, view_src); - view_src_hn->n_views -= 1; - AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views); - if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { - ggml_allocator_free_tensor(alloc, view_src); + // update immediately if there is no parse_seq + // update only at barriers if there is parse_seq + if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) { + int update_start = alloc->parse_seq_len ? last_barrier_pos : ind; + int update_end = alloc->parse_seq_len ? ind : ind + 1; + for (int i = update_start; i < update_end; i++) { + int node_i = alloc->parse_seq_len ? 
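The parent-update step that follows is reference counting in miniature: every tensor tracks how many children and views still read from it, and its buffer is released only when both counts hit zero (views forward the release to their view_src). Roughly:

#include <cstdio>
#include <string>
#include <unordered_map>

struct Refs { int n_children = 0; int n_views = 0; };

// Drop one child reference; free the parent once nothing reads from it.
// Sketch only: the real code also walks view_src chains.
static void release_child(std::unordered_map<std::string, Refs> & table,
                          const std::string & parent) {
    Refs & r = table[parent];
    r.n_children -= 1;
    if (r.n_children == 0 && r.n_views == 0) {
        std::printf("free %s\n", parent.c_str()); // ~ ggml_allocr_free_tensor()
    }
}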
alloc->parse_seq[i] : i; + struct ggml_tensor * node = gf->nodes[node_i]; + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; } - } - else { - if (parent->data != node->data) { - ggml_allocator_free_tensor(alloc, parent); + struct hash_node * p_hn = hash_get(ht, parent); + p_hn->n_children -= 1; + + //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); + + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = hash_get(ht, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { + ggml_allocr_free_tensor(alloc, view_src); + } + } + else { + if (parent->data != node->data) { + ggml_allocr_free_tensor(alloc, parent); + } + } } } } + AT_PRINTF("\n"); + if (alloc->parse_seq_len) { + last_barrier_pos = ind + 1; + } } - AT_PRINTF("\n"); } // free graph outputs here that wouldn't be freed otherwise because they have no children if (outputs != NULL && outputs[g] != NULL) { for (int i = 0; outputs[g][i] != NULL; i++) { struct ggml_tensor * output = outputs[g][i]; AT_PRINTF("output: %s\n", output->name); - ggml_allocator_free_tensor(alloc, output); + ggml_allocr_free_tensor(alloc, output); } } } @@ -537,5 +629,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n( } size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) { - return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); + return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); } diff --git a/ggml-alloc.h b/ggml-alloc.h index a5ec8f87a94530840ef7c030a6db9c3e9cdef6e0..9559da75871a608fb29f08edebf860674295e043 100644 --- a/ggml-alloc.h +++ b/ggml-alloc.h @@ -10,6 +10,10 @@ extern "C" { GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); +// tell the allocator to parse nodes following the order described in the list +// you should call this if your graph are optimized to execute out-of-order +GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n); + GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 856ebf1c1ae159876821d383ce73ac5767814ec1..78ba94f9a121949985810ba405399d4c6e93c158 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6,14 +6,146 @@ #include #include +#if defined(GGML_USE_HIPBLAS) +#include +#include +#include +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() +#include "rocblas/rocblas.h" +#endif // __HIP_PLATFORM_AMD__ +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasCreate hipblasCreate 
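The long #define table that opens the ggml-cuda.cu hunk ports the file to ROCm by renaming CUDA runtime and cuBLAS entry points to their HIP equivalents at preprocessing time, so the kernels build from a single source. The shape of the shim, reduced to a few entries:

#include <cstddef>

// Single-source portability: select the backend by macro, then write
// the rest of the file in CUDA spellings throughout.
#if defined(GGML_USE_HIPBLAS)
    #include <hip/hip_runtime.h>
    #define cudaMalloc  hipMalloc
    #define cudaFree    hipFree
    #define cudaError_t hipError_t
    #define cudaSuccess hipSuccess
#else
    #include <cuda_runtime.h>
#endif

static cudaError_t alloc_device(void ** ptr, size_t n) {
    return cudaMalloc(ptr, n); // becomes hipMalloc under GGML_USE_HIPBLAS
}

Calls with no direct HIP counterpart get tiny adapters instead, e.g. cublasSetMathMode is defined away to a success status since hipBLAS has no math-mode switch.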
+#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMemcpy hipMemcpy +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#else #include #include #include +#endif // defined(GGML_USE_HIPBLAS) #include "ggml-cuda.h" #include "ggml.h" -#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define CC_TURING 700 +#define CC_OFFSET_AMD 1000000 +#define CC_RDNA2 CC_OFFSET_AMD + 1030 + +#if defined(GGML_USE_HIPBLAS) +#define __CUDA_ARCH__ 1300 + +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ + defined(__gfx1150__) || defined(__gfx1151__) +#define RDNA3 +#endif + +#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ + defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) +#define RDNA2 +#endif + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); +#if __has_builtin(__builtin_elementwise_sub_sat) + const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); + return reinterpret_cast(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); + if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); + c[i] = tmp; + } + 
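__vsubss4 is byte-wise subtraction with signed saturation, and the fallback above clamps each lane by hand when the __builtin_elementwise_sub_sat builtin is unavailable. The same operation as plain host C++, treating an int as four packed int8 lanes:

#include <algorithm>
#include <cstdint>
#include <cstring>

// Reference __vsubss4: per-lane saturating subtract on packed int8x4.
static int vsubss4_ref(int a, int b) {
    int8_t va[4], vb[4], vc[4];
    std::memcpy(va, &a, 4);
    std::memcpy(vb, &b, 4);
    for (int i = 0; i < 4; i++) {
        const int tmp = va[i] - vb[i];                        // widen: no overflow
        vc[i] = (int8_t) std::min(127, std::max(-128, tmp));  // clamp to int8
    }
    int c;
    std::memcpy(&c, vc, 4);
    return c;
}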
return reinterpret_cast(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) +} + +static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { +#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) + c = __builtin_amdgcn_sdot4(a, b, c, false); +#elif defined(__gfx1100__) + c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); +#elif defined(__gfx1010__) || defined(__gfx900__) + int tmp1; + int tmp2; + asm("\n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + " + : "+v"(c), "=&v"(tmp1), "=&v"(tmp2) + : "v"(a), "v"(b) + ); +#else + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; +#endif + return c; +} +#endif // defined(GGML_USE_HIPBLAS) #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -25,8 +157,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); do { \ cudaError_t err_ = (err); \ if (err_ != cudaSuccess) { \ - fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ + int id; \ + cudaGetDevice(&id); \ + fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ cudaGetErrorString(err_)); \ + fprintf(stderr, "current device: %d\n", id); \ exit(1); \ } \ } while (0) @@ -36,8 +171,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); do { \ cublasStatus_t err_ = (err); \ if (err_ != CUBLAS_STATUS_SUCCESS) { \ + int id; \ + cudaGetDevice(&id); \ fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \ err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \ + fprintf(stderr, "current device: %d\n", id); \ exit(1); \ } \ } while (0) @@ -46,12 +184,21 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); do { \ cublasStatus_t err_ = (err); \ if (err_ != CUBLAS_STATUS_SUCCESS) { \ + int id; \ + cudaGetDevice(&id); \ fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + fprintf(stderr, "current device: %d\n", id); \ exit(1); \ } \ } while (0) #endif // CUDART_VERSION >= 11 +#if CUDART_VERSION >= 11100 +#define GGML_CUDA_ASSUME(x) __builtin_assume(x) +#else +#define GGML_CUDA_ASSUME(x) +#endif // CUDART_VERSION >= 11100 + #ifdef GGML_CUDA_F16 typedef half dfloat; // dequantize float typedef half2 dfloat2; @@ -93,10 +240,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); -typedef void (*ggml_cuda_op_t)( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, - float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main); +typedef void 
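__dp4a is the packed int8 dot product with 32-bit accumulate that every mul_mat_q kernel leans on; on AMD it maps to sdot/sudot instructions where available and otherwise to the scalar loop above. A host C++ reference of the contract:

#include <cstdint>
#include <cstring>

// Reference __dp4a(a, b, c): c + dot(int8x4(a), int8x4(b)).
static int dp4a_ref(int a, int b, int c) {
    int8_t va[4], vb[4];
    std::memcpy(va, &a, 4);
    std::memcpy(vb, &b, 4);
    for (int i = 0; i < 4; i++) {
        c += va[i] * vb[i]; // lanes widen to int before accumulating
    }
    return c;
}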
(*ggml_cuda_op_mul_mat_t)( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream); +typedef void (*ggml_cuda_op_flatten_t)( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream); // QK = number of values after dequantization // QR = QK / number of values before dequantization @@ -204,11 +354,11 @@ typedef struct { #define QI4_K (QK_K / (4*QR4_K)) #ifdef GGML_QKK_64 typedef struct { - half d[2]; // super-block scales/mins + half dm[2]; // super-block scales/mins uint8_t scales[2]; // 4-bit block scales/mins uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding"); +static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding"); #else typedef struct { half2 dm; // super-block scale for quantized scales/mins @@ -258,14 +408,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_CPY_BLOCK_SIZE 32 #define CUDA_SCALE_BLOCK_SIZE 256 #define CUDA_ROPE_BLOCK_SIZE 256 +#define CUDA_ALIBI_BLOCK_SIZE 32 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 #define CUDA_QUANTIZE_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 -#ifndef GGML_CUDA_MMQ_Y -#define GGML_CUDA_MMQ_Y 64 -#endif // GGML_CUDA_MMQ_Y - // dmmv = dequantize_mul_mat_vec #ifndef GGML_CUDA_DMMV_X #define GGML_CUDA_DMMV_X 32 @@ -280,11 +427,45 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); #endif +#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE +#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128 +#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE + +#define MUL_MAT_SRC1_COL_STRIDE 128 + +#define MAX_STREAMS 8 +static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr }; + struct ggml_tensor_extra_gpu { void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors - cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs + cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs }; +// this is faster on Windows +// probably because the Windows CUDA libraries forget to make this check before invoking the drivers +inline cudaError_t ggml_cuda_set_device(const int device) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + + if (device == current_device) { + return cudaSuccess; + } + + return cudaSetDevice(device); +} + +static int g_device_count = -1; +static int g_main_device = 0; +static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; +static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; +static bool g_mul_mat_q = true; + +static void * g_scratch_buffer = nullptr; +static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default +static size_t g_scratch_offset = 0; + +static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; + static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -334,58 +515,91 @@ static __global__ void 
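The CUDA_CHECK/CUBLAS_CHECK hunks add the currently active device to every error report, which is the first thing you need to know once multi-GPU splits and multiple streams are in play. The same pattern as a standalone macro (same idea, independent spelling):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Check a CUDA call and report which device was active when it failed.
#define CHECK_CUDA(err)                                                   \
    do {                                                                  \
        cudaError_t err_ = (err);                                         \
        if (err_ != cudaSuccess) {                                        \
            int id = -1;                                                  \
            cudaGetDevice(&id);                                           \
            fprintf(stderr, "CUDA error %d at %s:%d: %s (device %d)\n",   \
                    (int) err_, __FILE__, __LINE__,                       \
                    cudaGetErrorString(err_), id);                        \
            exit(1);                                                      \
        }                                                                 \
    } while (0)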
silu_f32(const float * x, float * dst, const int k) { dst[i] = x[i] / (1.0f + expf(-x[i])); } +static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32); + a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32); + } + return a; +} + +template static __global__ void norm_f32(const float * x, float * dst, const int ncols) { const int row = blockIdx.x*blockDim.y + threadIdx.y; const int tid = threadIdx.x; const float eps = 1e-5f; - float mean = 0.0f; - float var = 0.0f; + float2 mean_var = make_float2(0.f, 0.f); - for (int col = tid; col < ncols; col += WARP_SIZE) { + for (int col = tid; col < ncols; col += block_size) { const float xi = x[row*ncols + col]; - mean += xi; - var += xi * xi; + mean_var.x += xi; + mean_var.y += xi * xi; } // sum up partial sums -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - mean += __shfl_xor_sync(0xffffffff, mean, mask, 32); - var += __shfl_xor_sync(0xffffffff, var, mask, 32); + mean_var = warp_reduce_sum(mean_var); + if (block_size > WARP_SIZE) { + __shared__ float2 s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = mean_var; + } + __syncthreads(); + mean_var = s_sum[lane_id]; + mean_var = warp_reduce_sum(mean_var); } - mean /= ncols; - var = var / ncols - mean * mean; - const float inv_var = rsqrtf(var + eps); + const float mean = mean_var.x / ncols; + const float var = mean_var.y / ncols - mean * mean; + const float inv_std = rsqrtf(var + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std; + } +} - for (int col = tid; col < ncols; col += WARP_SIZE) { - dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var; +static __device__ __forceinline__ float warp_reduce_sum(float x) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, mask, 32); } + return x; } +template static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) { const int row = blockIdx.x*blockDim.y + threadIdx.y; const int tid = threadIdx.x; float tmp = 0.0f; // partial sum for thread in warp - for (int col = tid; col < ncols; col += WARP_SIZE) { + for (int col = tid; col < ncols; col += block_size) { const float xi = x[row*ncols + col]; tmp += xi * xi; } // sum up partial sums -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + tmp = warp_reduce_sum(tmp); + if (block_size > WARP_SIZE) { + __shared__ float s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + __syncthreads(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp); } const float mean = tmp / ncols; const float scale = rsqrtf(mean + eps); - for (int col = tid; col < ncols; col += WARP_SIZE) { + for (int col = tid; col < ncols; col += block_size) { dst[row*ncols + col] = scale * x[row*ncols + col]; } } @@ -412,8 +626,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q4_1 * x = (const block_q4_1 *) vx; - const dfloat d = x[ib].dm.x; - const dfloat m = x[ib].dm.y; + const dfloat d = __low2half(x[ib].dm); + const dfloat m = 
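norm_f32 and rms_norm_f32 become templated on block size: each warp reduces its partial sums with XOR shuffles, then one lane per warp stages the result in shared memory and a final warp reduces across warps. A serial C++ analogue of the XOR-butterfly step, where after log2(n) rounds every slot holds the total:

// Serial analogue of the __shfl_xor_sync reduction (n a power of two).
static void xor_reduce(float * v, int n) {
    for (int mask = n / 2; mask > 0; mask >>= 1) {
        for (int i = 0; i < n; i++) {
            if (i < (i ^ mask)) {          // visit each XOR pair once
                const float s = v[i] + v[i ^ mask];
                v[i] = v[i ^ mask] = s;    // both lanes end with the pair sum
            }
        }
    }
}

The cross-warp staging area only needs 32 slots because a block holds at most 32 warps of 32 lanes, so the second pass is itself just a single-warp reduction.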
__high2half(x[ib].dm); const int vui = x[ib].qs[iqs]; @@ -455,8 +669,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q5_1 * x = (const block_q5_1 *) vx; - const dfloat d = x[ib].dm.x; - const dfloat m = x[ib].dm.y; + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); @@ -508,8 +722,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const uint8_t q = x[i].qs[32*n + l]; float * y = yy + i*QK_K + 128*n; - float dall = x[i].dm.x; - float dmin = x[i].dm.y; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); @@ -519,8 +733,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const int il = tid%16; // 0...15 const uint8_t q = x[i].qs[il] >> (2*is); float * y = yy + i*QK_K + 16*is + il; - float dall = x[i].dm.x; - float dmin = x[i].dm.y; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); #endif @@ -606,8 +820,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float float * y = yy + i*QK_K + 64*il + n*ir; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint8_t * q = x[i].qs + 32*il + n*ir; @@ -624,8 +838,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float const int tid = threadIdx.x; const uint8_t * q = x[i].qs; float * y = yy + i*QK_K; - const float d = (float)x[i].d[0]; - const float m = (float)x[i].d[1]; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); #endif @@ -645,8 +859,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float float * y = yy + i*QK_K + 64*il + 2*ir; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint8_t * ql = x[i].qs + 32*il + 2*ir; const uint8_t * qh = x[i].qh + 2*ir; @@ -758,8 +972,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * y = yy + i * QK_K + y_offset; const uint8_t * q = x[i].qs + q_offset; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); aux[0] = a[0] & 0x0f0f0f0f; @@ -979,8 +1193,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const 
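The many .x/.y to __low2half/__high2half conversions in this region exist because poking the members of a half2 directly is NVCC-specific and does not compile under HIP; the intrinsics unpack the two halves portably. For q4_1-style blocks the low half is the scale d and the high half the minimum m, so dequantization is v = d*q + m. A float reference, following ggml's nibble layout (low nibbles fill the first half of the block, high nibbles the second):

#include <cstdint>

// Reference q4_1 dequantization: dm packs (d = scale, m = min).
static void dequant_q4_1_ref(float d, float m, const uint8_t * qs,
                             int n_bytes, float * out) {
    for (int i = 0; i < n_bytes; i++) {
        out[i]           = d * (qs[i] & 0x0F) + m; // low nibble,  first half
        out[i + n_bytes] = d * (qs[i] >>   4) + m; // high nibble, second half
    }
}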
float dmin = __high2half(x[i].dm); const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; @@ -1042,8 +1256,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const uint16_t * a = (const uint16_t *)x[i].scales; aux16[0] = a[0] & 0x0f0f; aux16[1] = (a[0] >> 4) & 0x0f0f; - const float d = (float)x[i].d[0]; - const float m = (float)x[i].d[1]; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; float sum = 0.f; for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) @@ -1112,8 +1326,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm.x; - const float dmin = x[i].dm.y; + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; @@ -1336,8 +1550,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest return; } - y[ib].ds.x = d; - y[ib].ds.y = sum; + reinterpret_cast(y[ib].ds.x) = d; + reinterpret_cast(y[ib].ds.y) = sum; } template @@ -1388,6 +1602,7 @@ template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp // second part effectively subtracts 8 from each quant value return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1425,6 +1640,7 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1460,6 +1676,7 @@ template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp // second part effectively subtracts 16 from each quant value return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1505,6 +1722,7 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp return sumi*d5d8 + m5s8 / (QI5_1 / vdr); #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1526,6 +1744,7 @@ template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp return d8_0*d8_1 * sumi; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1549,13 +1768,14 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp #else const float2 dm8f = __half22float2(dm8); const float2 ds8f = __half22float2(ds8); - const float d8d8 = dm8.x * ds8.x; - const float m8s8 = dm8.y * ds8.y; + const float d8d8 = dm8f.x * ds8f.x; + const float m8s8 = dm8f.y * ds8f.y; #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it return sumi*d8d8 + m8s8 / (QI8_1 / vdr); #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1591,6 +1811,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( return dm2f.x*sumf_d - dm2f.y*sumf_m; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1628,6 +1849,7 @@ static 
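quantize_q8_1 stores two values per block, the scale d and the sum of the block's inputs, which lets the q8_1 dot products fold in the other operand's minimum without a second pass; the reinterpret_cast writes are needed because assigning through half2's .x/.y is, again, not portable. A scalar sketch of what the kernel computes per block (the kernel does this warp-parallel with shuffles):

#include <cmath>
#include <cstdint>

// Quantize n floats to int8 with one scale; also record the input sum.
static void quantize_q8_1_ref(const float * x, int n,
                              int8_t * q, float * d, float * sum) {
    float amax = 0.0f, s = 0.0f;
    for (int i = 0; i < n; i++) {
        amax = std::fmax(amax, std::fabs(x[i]));
        s += x[i];                                // block sum, stored with d
    }
    *d   = amax / 127.0f;
    *sum = s;
    const float id = amax > 0.0f ? 127.0f / amax : 0.0f;
    for (int i = 0; i < n; i++) {
        q[i] = (int8_t) std::lround(x[i] * id);   // round-to-nearest quant
    }
}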
__device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1668,6 +1890,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( return d3 * sumf; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1693,6 +1916,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( return d3*d8 * sumi; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1726,12 +1950,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } // contiguous u/y values -// also used for q5_K static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { @@ -1741,19 +1965,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( float sumf_m = 0.0f; #pragma unroll - for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) { + for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { int sumi_d = 0; #pragma unroll - for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) { - sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product - sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product } - const float2 ds8f = __half22float2(ds8[i0 / 4]); + const float2 ds8f = __half22float2(ds8[i]); - sumf_d += ds8f.x * (sc[i0/4] * sumi_d); - sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val } const float2 dm4f = __half22float2(dm4); @@ -1761,6 +1984,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1769,7 +1993,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( #define VDR_Q5_K_Q8_1_MMQ 8 // contiguous v/x values -static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl( +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { @@ -1801,6 +2025,41 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl( return dm5f.x*sumf_d - dm5f.y*sumf_m; #else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < 
QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1831,6 +2090,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( return d*sumf; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1862,6 +2122,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( return d6 * sumf_d; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1884,23 +2145,23 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q4_0( +template static __device__ __forceinline__ void load_tiles_q4_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI4_0; const int kqsx = k % QI4_0; @@ -1910,7 +2171,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -1920,39 +2181,30 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; } -// const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; -// const int kbxd = k % blocks_per_tile_x_row; + const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = k % blocks_per_tile_x_row; -// #pragma unroll -// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) { -// FIXME out-of-bounds -// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; -// if (i >= GGML_CUDA_MMQ_Y) { -// return; -// } + if 
(need_check) { + i = min(i, i_max); + } -// const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; -// x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d; -// } + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } } static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_0_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const float * x_dmf = (float *) x_dm; @@ -1960,13 +2212,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; } return vec_dot_q4_0_q8_1_impl (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], - y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q4_1_q8_1( @@ -1987,23 +2239,23 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; *x_ql = tile_x_qs; *x_dm = tile_x_dm; } -template static __device__ __forceinline__ void load_tiles_q4_1( +template static __device__ __forceinline__ void load_tiles_q4_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI4_1; const int kqsx = k % QI4_1; @@ -2011,7 +2263,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_1 * bx0 = (block_q4_1 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2027,7 +2279,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { 
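The blanket __builtin_assume to GGML_CUDA_ASSUME replacements are there because __builtin_assume only exists on newer CUDA toolchains; the macro keeps the optimizer hints where available and compiles to nothing elsewhere. Note also that the hard-coded bound of 8 became nwarps, since the warp count is now a template parameter. The macro's shape, under an illustrative name:

// Optimizer hint that vanishes on toolchains without __builtin_assume.
#if defined(CUDART_VERSION) && CUDART_VERSION >= 11100
    #define MY_ASSUME(x) __builtin_assume(x)
#else
    #define MY_ASSUME(x)
#endif

static int clamp_tile_row(int i_offset, int nwarps) {
    MY_ASSUME(i_offset >= 0);
    MY_ASSUME(i_offset < nwarps); // range hint: lets the compiler drop checks
    return i_offset;
}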
int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; if (need_check) { @@ -2044,27 +2296,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_1_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); int u[2*VDR_Q4_1_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; } return vec_dot_q4_1_q8_1_impl (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], - y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_0_q8_1( @@ -2087,23 +2331,23 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; *x_ql = tile_x_ql; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q5_0( +template static __device__ __forceinline__ void load_tiles_q5_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); const int kbx = k / QI5_0; const int kqsx = k % QI5_0; @@ -2111,7 +2355,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_0 * bx0 = (block_q5_0 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2147,7 +2391,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; if (need_check) { @@ -2164,14 +2408,6 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, 
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-    __builtin_assume(k % VDR_Q5_0_Q8_1_MMQ == 0);
-
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
     const float * x_dmf = (const float *) x_dm;
@@ -2181,12 +2417,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(

 #pragma unroll
     for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
-        u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
     }

     return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }

 static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
@@ -2209,23 +2445,23 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
     return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
 }

-static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
-    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
+    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < 8);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx  = k / QI5_1;
     const int kqsx = k % QI5_1;
@@ -2233,7 +2469,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     const block_q5_1 * bx0 = (block_q5_1 *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -2266,7 +2502,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     const int kbxd = k % blocks_per_tile_x_row;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
         int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;

         if (need_check) {
@@ -2283,14 +2519,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-    __builtin_assume(k % VDR_Q5_1_Q8_1_MMQ == 0);
-
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2298,12 +2526,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(

 #pragma unroll
     for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
-        u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
     }

     return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }

 static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
@@ -2320,26 +2548,26 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
         u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
     }

-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
 }

-static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE)       + GGML_CUDA_MMQ_Y];
-    __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
+    __shared__ int   tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
+    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];

     *x_ql = tile_x_qs;
     *x_dm = (half2 *) tile_x_d;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < 8);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx  = k / QI8_0;
     const int kqsx = k % QI8_0;
@@ -2348,7 +2576,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
     const block_q8_0 * bx0 = (block_q8_0 *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -2358,41 +2586,29 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
         const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;

         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
-        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
     }

-//     const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
-//     const int kbxd = k % blocks_per_tile_x_row;
+    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
+    const int kbxd = k % blocks_per_tile_x_row;

-// #pragma unroll
-//     for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
-//         FIXME out-of-bounds
-//         const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
+        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;

-// #if GGML_CUDA_MMQ_Y < 64
-//         if (i >= GGML_CUDA_MMQ_Y) {
-//             return;
-//         }
-// #endif // GGML_CUDA_MMQ_Y < 64
+        if (need_check) {
+            i = min(i, i_max);
+        }

-//         const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;

-//         x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
-//     }
+        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
+    }
 }
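[Editor's note: GGML_CUDA_ASSUME, which this patch substitutes for the raw __builtin_assume calls, is defined earlier in ggml-cuda.cu, outside this excerpt. A minimal sketch of such a portability wrapper, assuming the usual convention of gating on the CUDA toolkit version; the guard in the real file may differ:

// Sketch only -- the actual macro lives earlier in this file.
// __builtin_assume is reliably available from CUDA 11.1 on; on older
// toolkits the optimization hint is simply dropped.
#if CUDART_VERSION >= 11100
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11100

Making the bound a template parameter (nwarps) instead of the hard-coded 8 keeps the hint truthful now that the warp count varies per architecture.]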
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-    __builtin_assume(k % VDR_Q8_0_Q8_1_MMQ == 0);
-
     const float * x_dmf = (const float *) x_dm;
     const float * y_df  = (const float *) y_ds;
@@ -2418,31 +2634,31 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(

 #pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }

     return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
 }

-static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
-    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
-    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
+    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
+    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
     *x_sc = tile_x_sc;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < 8);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx  = k / QI2_K;
     const int kqsx = k % QI2_K;
@@ -2450,7 +2666,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
     const block_q2_K * bx0 = (block_q2_K *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -2466,8 +2682,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
     const int kbxd = k % blocks_per_tile_x_row;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
-        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
+        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -2479,7 +2695,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
     }

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
         int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

         if (need_check) {
@@ -2496,14 +2712,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-    __builtin_assume(k % VDR_Q2_K_Q8_1_MMQ == 0);
-
     const int kbx = k / QI2_K;
     const int ky  = (k % QI2_K) * QR2_K;
     const float * y_df = (const float *) y_ds;
@@ -2520,7 +2728,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(

     const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;

-    const int index_y = j * (QR2_K*WARP_SIZE) + QR2_K*k;
+    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
     return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
 }
@@ -2545,18 +2753,18 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(

 #pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }

     return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
 }

-static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
-    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
-    __shared__ int   tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
-    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
+    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
+    __shared__ int   tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
+    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
@@ -2564,14 +2772,14 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
     *x_sc = tile_x_sc;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < 8);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx  = k / QI3_K;
     const int kqsx = k % QI3_K;
@@ -2579,7 +2787,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
     const block_q3_K * bx0 = (block_q3_K *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -2596,8 +2804,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
     float * x_dmf = (float *) x_dm;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
-        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
+        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -2609,7 +2817,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
     }

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
         int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);

         if (need_check) {
@@ -2623,7 +2831,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
     }

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
         int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

         if (need_check) {
@@ -2652,14 +2860,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-    __builtin_assume(k % VDR_Q3_K_Q8_1_MMQ == 0);
-
     const int kbx  = k / QI3_K;
     const int ky  = (k % QI3_K) * QR3_K;
     const float * x_dmf = (const float *) x_dm;
@@ -2681,7 +2881,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
         v[l] = __vsubss4(vll, vlh);
     }

-    const int index_y = j * (QR3_K*WARP_SIZE) + k*QR3_K;
+    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
     return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
 }
@@ -2722,7 +2922,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2half(bq8i->ds);

         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -2746,11 +2946,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;

-    const float dall = bq4_K->d[0];
-    const float dmin = bq4_K->d[1];
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];

-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);

     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2772,31 +2972,32 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

 #endif
 }

-static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
-    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
-    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
+    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
+    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
     *x_sc = tile_x_sc;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < 8);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx  = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -2804,7 +3005,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
     const block_q4_K * bx0 = (block_q4_K *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -2820,8 +3021,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
     const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
-        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
+        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -2829,12 +3030,16 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
     }

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
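[Editor's note: every tile above reserves mmq_y extra elements (for example mmq_y * (WARP_SIZE) + mmq_y), that is, one padding element per tile row, and indexes rows with a stride of WARP_SIZE + 1. A hedged aside on why: with a power-of-two row stride, the threads of a warp reading one column of the tile all hit the same shared-memory bank and serialize; padding the stride by one spreads the column across banks. Illustrative standalone kernel, all names hypothetical and not part of the patch:

// Minimal illustration of the +1 padding trick. Launch with dim3(32, 8).
#define TILE_ROWS 64
#define TILE_COLS 32 // == warp size

__global__ void column_sum(const int * in, int * out) {
    // row stride TILE_COLS + 1 keeps tile[0][c], tile[1][c], ... in
    // different banks, so a warp reading one column is conflict-free
    __shared__ int tile[TILE_ROWS][TILE_COLS + 1];

    for (int r = threadIdx.y; r < TILE_ROWS; r += blockDim.y) {
        tile[r][threadIdx.x] = in[r*TILE_COLS + threadIdx.x];
    }
    __syncthreads();

    if (threadIdx.y == 0) {
        int sum = 0;
        for (int r = 0; r < TILE_ROWS; ++r) {
            sum += tile[r][threadIdx.x]; // column access, one bank per thread
        }
        out[threadIdx.x] = sum;
    }
}]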
@@ -2858,26 +3063,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-    __builtin_assume(k % VDR_Q4_K_Q8_1_MMQ == 0);
-
-    int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
-        v[l + 0]         = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
-        v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
-    }
-
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

-    const int index_y = j * (QR4_K*WARP_SIZE) + QR4_K*k;
-    return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2917,14 +3107,14 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR5_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2float(bq8i->ds);

         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
         u[2*i+1] = q8[4];
     }

-    return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);

 #else
@@ -2935,8 +3125,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     const float d = bq5_K->d;

-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);

     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2963,31 +3153,32 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

 #endif
 }

-static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
-    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
-    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
+    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
+    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
     *x_sc = tile_x_sc;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < 8);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx  = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -2995,7 +3186,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     const block_q5_K * bx0 = (block_q5_K *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -3024,8 +3215,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
-        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
+        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -3033,12 +3224,14 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
     }

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -3062,19 +3255,12 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-    __builtin_assume(k % VDR_Q5_K_Q8_1_MMQ == 0);
-
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);

-    const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
-    const int index_y = j * (QR5_K*WARP_SIZE)     + QR5_K*k;
-    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
+    const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -3097,31 +3283,31 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR6_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
     }

     return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
 }

-static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int   tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
-    __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
-    __shared__ int   tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
+    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
+    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
     *x_sc = tile_x_sc;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < 8);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx  = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -3129,7 +3315,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
     const block_q6_K * bx0 = (block_q6_K *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -3159,8 +3345,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
     float * x_dmf = (float *) x_dm;

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
-        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
+        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -3172,8 +3358,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
     }

 #pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -3189,27 +3375,19 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-    __builtin_assume(k % VDR_Q6_K_Q8_1_MMQ == 0);
-
     const float * x_dmf = (const float *) x_dm;
     const float * y_df  = (const float *) y_ds;

     const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);

-    const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
-    const int index_y = j * (QR6_K*WARP_SIZE)     + QR6_K*k;
+    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
+    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
     return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
 }

-template <int qk, int qr, int qi, bool need_sum, typename block_q_t,
-          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static __global__ void mul_mat_q(
+template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
+          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
+static __device__ __forceinline__ void mul_mat_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
@@ -3222,14 +3400,10 @@ static __global__ void mul_mat_q(

     const int & ncols_dst = ncols_y;

-    const int tid_x = threadIdx.x;
-    const int tid_y = threadIdx.y;
-
-    const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
+    const int row_dst_0 = blockIdx.x*mmq_y;
     const int & row_x_0 = row_dst_0;
-    const int row_dst = row_dst_0 + tid_x;
-
-    const int col_dst_0 = blockIdx.y*WARP_SIZE;
+    const int col_dst_0 = blockIdx.y*mmq_x;
     const int & col_y_0 = col_dst_0;

     int   * tile_x_ql = nullptr;
@@ -3239,84 +3413,706 @@ static __global__ void mul_mat_q(

     allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);

-    const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
+    __shared__ int   tile_y_qs[mmq_x * WARP_SIZE];
+    __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

-    __shared__ int   tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
-    __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
-
-    float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};

     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

         load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-                   tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
+                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);

+#pragma unroll
         for (int ir = 0; ir < qr; ++ir) {
-            const int kqs = ir*WARP_SIZE + tid_x;
+            const int kqs = ir*WARP_SIZE + threadIdx.x;
             const int kbxd = kqs / QI8_1;

-            for (int i = 0; i < WARP_SIZE; i += 8) {
-                const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
+#pragma unroll
+            for (int i = 0; i < mmq_x; i += nwarps) {
+                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses

                 const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];

-                tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
+                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
+                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
             }
-        }

-        for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
-            const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
-            const int kby = tid_x % blocks_per_tile_y_col;
-            const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
-
-            // if the sum is not needed it's faster to transform the scale to f32 ahead of time
-            const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
-            half2       * dsi_dst = &tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby];
-            if (need_sum) {
-                *dsi_dst = *dsi_src;
-            } else {
-                float * dfi_dst = (float *) dsi_dst;
-                *dfi_dst = (*dsi_src).x;
+#pragma unroll
+            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
+                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
+                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
+
+                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
+                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
+                if (need_sum) {
+                    *dsi_dst = *dsi_src;
+                } else {
+                    float * dfi_dst = (float *) dsi_dst;
+                    *dfi_dst = __low2half(*dsi_src);
+                }
             }
-        }

-        __syncthreads();
+            __syncthreads();

-#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
+// #pragma unroll // unrolling this loop causes too much register pressure
+            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
 #pragma unroll
-#endif // __CUDA_ARCH__ >= 700
-        for (int k = 0; k < WARP_SIZE; k += vdr) {
+                for (int j = 0; j < mmq_x; j += nwarps) {
 #pragma unroll
-            for (int j = 0; j < WARP_SIZE; j += 8) {
-#pragma unroll
-                for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
-                    sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
-                                                     tid_x + i, tid_y + j, k);
+                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
+                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
+                            threadIdx.x + i, threadIdx.y + j, k);
+                    }
                 }
             }
-        }
-
-        __syncthreads();
-    }
-
-    if (row_dst >= nrows_dst) {
-        return;
+            __syncthreads();
+        }
     }

-    for (int j = 0; j < WARP_SIZE; j += 8) {
-        const int col_dst = col_dst_0 + j + tid_y;
+#pragma unroll
+    for (int j = 0; j < mmq_x; j += nwarps) {
+        const int col_dst = col_dst_0 + j + threadIdx.y;

         if (col_dst >= ncols_dst) {
             return;
         }

-        for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
-            dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
+#pragma unroll
+        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+            const int row_dst = row_dst_0 + threadIdx.x + i;
+
+            if (row_dst >= nrows_dst) {
+                continue;
+            }
+
+            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
         }
     }
 }
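[Editor's note: the restructured mul_mat_q above follows a classic tiled matrix-multiplication schedule: load an x tile into shared memory, load the matching slice of y, synchronize, accumulate the partial dot products, synchronize again before the tiles are overwritten, then write out with bounds checks. A stripped-down sketch of that control flow, using plain floats instead of quantized blocks and hypothetical tile sizes, assuming blockDim == (MMQ_Y, MMQ_X) and dimensions divisible by the tile sizes; illustrative only, not the patch's kernel:

#define MMQ_Y  32  // rows of dst per block (hypothetical)
#define MMQ_X  8   // cols of dst per block (hypothetical)
#define TILE_K 32  // chunk of the shared dimension

__global__ void mul_mat_tiled(const float * x, const float * y, float * dst,
                              const int ncols_x, const int nrows_dst) {
    __shared__ float tile_x[MMQ_Y][TILE_K + 1]; // +1 pad, see note above
    __shared__ float tile_y[MMQ_X][TILE_K + 1];

    const int row = blockIdx.x*MMQ_Y + threadIdx.x;
    const int col = blockIdx.y*MMQ_X + threadIdx.y;

    float sum = 0.0f;

    for (int k0 = 0; k0 < ncols_x; k0 += TILE_K) {
        // cooperative loads, then a barrier before anyone reads the tiles
        for (int k = threadIdx.y; k < TILE_K; k += MMQ_X) {
            tile_x[threadIdx.x][k] = x[row*ncols_x + k0 + k];
        }
        for (int k = threadIdx.x; k < TILE_K; k += MMQ_Y) {
            tile_y[threadIdx.y][k] = y[col*ncols_x + k0 + k];
        }
        __syncthreads();

        for (int k = 0; k < TILE_K; ++k) {
            sum += tile_x[threadIdx.x][k] * tile_y[threadIdx.y][k];
        }
        __syncthreads(); // tiles are overwritten on the next iteration
    }

    dst[col*nrows_dst + row] = sum; // real kernel clamps row/col first
}

The real kernel additionally interleaves the qr dequantization passes over y with the accumulation, which is why the k loop runs over ir*WARP_SIZE/qr .. (ir+1)*WARP_SIZE/qr per pass.]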
+#define  MMQ_X_Q4_0_RDNA2  64
+#define  MMQ_Y_Q4_0_RDNA2  128
+#define NWARPS_Q4_0_RDNA2  8
+#define  MMQ_X_Q4_0_RDNA1  64
+#define  MMQ_Y_Q4_0_RDNA1  64
+#define NWARPS_Q4_0_RDNA1  8
+#define  MMQ_X_Q4_0_AMPERE 64
+#define  MMQ_Y_Q4_0_AMPERE 128
+#define NWARPS_Q4_0_AMPERE 4
+#define  MMQ_X_Q4_0_PASCAL 64
+#define  MMQ_Y_Q4_0_PASCAL 64
+#define NWARPS_Q4_0_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
+    const int nwarps = NWARPS_Q4_0_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
+    const int nwarps = NWARPS_Q4_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, false, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+    const int nwarps = NWARPS_Q4_0_AMPERE;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, false, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+    const int nwarps = NWARPS_Q4_0_PASCAL;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, false, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q4_1_RDNA2  64
+#define  MMQ_Y_Q4_1_RDNA2  128
+#define NWARPS_Q4_1_RDNA2  8
+#define  MMQ_X_Q4_1_RDNA1  64
+#define  MMQ_Y_Q4_1_RDNA1  64
+#define NWARPS_Q4_1_RDNA1  8
+#define  MMQ_X_Q4_1_AMPERE 64
+#define  MMQ_Y_Q4_1_AMPERE 128
+#define NWARPS_Q4_1_AMPERE 4
+#define  MMQ_X_Q4_1_PASCAL 64
+#define  MMQ_Y_Q4_1_PASCAL 64
+#define NWARPS_Q4_1_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
+    const int nwarps = NWARPS_Q4_1_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
+    const int nwarps = NWARPS_Q4_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+    const int nwarps = NWARPS_Q4_1_AMPERE;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
+    const int nwarps = NWARPS_Q4_1_PASCAL;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_1_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q5_0_RDNA2  64
+#define  MMQ_Y_Q5_0_RDNA2  128
+#define NWARPS_Q5_0_RDNA2  8
+#define  MMQ_X_Q5_0_RDNA1  64
+#define  MMQ_Y_Q5_0_RDNA1  64
+#define NWARPS_Q5_0_RDNA1  8
+#define  MMQ_X_Q5_0_AMPERE 128
+#define  MMQ_Y_Q5_0_AMPERE 64
+#define NWARPS_Q5_0_AMPERE 4
+#define  MMQ_X_Q5_0_PASCAL 64
+#define  MMQ_Y_Q5_0_PASCAL 64
+#define NWARPS_Q5_0_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
+    const int nwarps = NWARPS_Q5_0_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
+    const int nwarps = NWARPS_Q5_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+    const int nwarps = NWARPS_Q5_0_AMPERE;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
+    const int nwarps = NWARPS_Q5_0_PASCAL;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q5_1_RDNA2  64
+#define  MMQ_Y_Q5_1_RDNA2  128
+#define NWARPS_Q5_1_RDNA2  8
+#define  MMQ_X_Q5_1_RDNA1  64
+#define  MMQ_Y_Q5_1_RDNA1  64
+#define NWARPS_Q5_1_RDNA1  8
+#define  MMQ_X_Q5_1_AMPERE 128
+#define  MMQ_Y_Q5_1_AMPERE 64
+#define NWARPS_Q5_1_AMPERE 4
+#define  MMQ_X_Q5_1_PASCAL 64
+#define  MMQ_Y_Q5_1_PASCAL 64
+#define NWARPS_Q5_1_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q5_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
+    const int nwarps = NWARPS_Q5_1_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
+    const int nwarps = NWARPS_Q5_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+    const int nwarps = NWARPS_Q5_1_AMPERE;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
+    const int nwarps = NWARPS_Q5_1_PASCAL;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_1_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q8_0_RDNA2  64
+#define  MMQ_Y_Q8_0_RDNA2  128
+#define NWARPS_Q8_0_RDNA2  8
+#define  MMQ_X_Q8_0_RDNA1  64
+#define  MMQ_Y_Q8_0_RDNA1  64
+#define NWARPS_Q8_0_RDNA1  8
+#define  MMQ_X_Q8_0_AMPERE 128
+#define  MMQ_Y_Q8_0_AMPERE 64
+#define NWARPS_Q8_0_AMPERE 4
+#define  MMQ_X_Q8_0_PASCAL 64
+#define  MMQ_Y_Q8_0_PASCAL 64
+#define NWARPS_Q8_0_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q8_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
+    const int nwarps = NWARPS_Q8_0_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
+    const int nwarps = NWARPS_Q8_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+    const int nwarps = NWARPS_Q8_0_AMPERE;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
+    const int nwarps = NWARPS_Q8_0_PASCAL;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q8_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q2_K_RDNA2  64
+#define  MMQ_Y_Q2_K_RDNA2  128
+#define NWARPS_Q2_K_RDNA2  8
+#define  MMQ_X_Q2_K_RDNA1  128
+#define  MMQ_Y_Q2_K_RDNA1  32
+#define NWARPS_Q2_K_RDNA1  8
+#define  MMQ_X_Q2_K_AMPERE 64
+#define  MMQ_Y_Q2_K_AMPERE 128
+#define NWARPS_Q2_K_AMPERE 4
+#define  MMQ_X_Q2_K_PASCAL 64
+#define  MMQ_Y_Q2_K_PASCAL 64
+#define NWARPS_Q2_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q2_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
+    const int nwarps = NWARPS_Q2_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
+    const int nwarps = NWARPS_Q2_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+    const int nwarps = NWARPS_Q2_K_AMPERE;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
+    const int nwarps = NWARPS_Q2_K_PASCAL;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q2_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+#define  MMQ_X_Q3_K_RDNA2  128
+#define  MMQ_Y_Q3_K_RDNA2  64
+#define NWARPS_Q3_K_RDNA2  8
+#define  MMQ_X_Q3_K_RDNA1  32
+#define  MMQ_Y_Q3_K_RDNA1  128
+#define NWARPS_Q3_K_RDNA1  8
+#define  MMQ_X_Q3_K_AMPERE 128
+#define  MMQ_Y_Q3_K_AMPERE 128
+#define NWARPS_Q3_K_AMPERE 4
+#define  MMQ_X_Q3_K_PASCAL 64
+#define  MMQ_Y_Q3_K_PASCAL 64
+#define NWARPS_Q3_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q3_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
+    const int nwarps = NWARPS_Q3_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
+    const int nwarps = NWARPS_Q3_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+    const int nwarps = NWARPS_Q3_K_AMPERE;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
+    const int nwarps = NWARPS_Q3_K_PASCAL;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q3_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q4_K_RDNA2  64
+#define  MMQ_Y_Q4_K_RDNA2  128
+#define NWARPS_Q4_K_RDNA2  8
+#define  MMQ_X_Q4_K_RDNA1  32
+#define  MMQ_Y_Q4_K_RDNA1  64
+#define NWARPS_Q4_K_RDNA1  8
+#define  MMQ_X_Q4_K_AMPERE 64
+#define  MMQ_Y_Q4_K_AMPERE 128
+#define NWARPS_Q4_K_AMPERE 4
+#define  MMQ_X_Q4_K_PASCAL 64
+#define  MMQ_Y_Q4_K_PASCAL 64
+#define NWARPS_Q4_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
+    const int nwarps = NWARPS_Q4_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
+    const int nwarps = NWARPS_Q4_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+    const int nwarps = NWARPS_Q4_K_AMPERE;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+    const int nwarps = NWARPS_Q4_K_PASCAL;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q5_K_RDNA2  64
+#define  MMQ_Y_Q5_K_RDNA2  128
+#define NWARPS_Q5_K_RDNA2  8
+#define  MMQ_X_Q5_K_RDNA1  32
+#define  MMQ_Y_Q5_K_RDNA1  64
+#define NWARPS_Q5_K_RDNA1  8
+#define  MMQ_X_Q5_K_AMPERE 64
+#define  MMQ_Y_Q5_K_AMPERE 128
+#define NWARPS_Q5_K_AMPERE 4
+#define  MMQ_X_Q5_K_PASCAL 64
+#define  MMQ_Y_Q5_K_PASCAL 64
+#define NWARPS_Q5_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q5_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
+    const int nwarps = NWARPS_Q5_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
+    const int nwarps = NWARPS_Q5_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+    const int nwarps = NWARPS_Q5_K_AMPERE;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+    const int nwarps = NWARPS_Q5_K_PASCAL;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define  MMQ_X_Q6_K_RDNA2  64
+#define  MMQ_Y_Q6_K_RDNA2  128
+#define NWARPS_Q6_K_RDNA2  8
+#define  MMQ_X_Q6_K_RDNA1  32
+#define  MMQ_Y_Q6_K_RDNA1  64
+#define NWARPS_Q6_K_RDNA1  8
+#define  MMQ_X_Q6_K_AMPERE 64
+#define  MMQ_Y_Q6_K_AMPERE 64
+#define NWARPS_Q6_K_AMPERE 4
+#define  MMQ_X_Q6_K_PASCAL 64
+#define  MMQ_Y_Q6_K_PASCAL 64
+#define NWARPS_Q6_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q6_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
+    const int nwarps = NWARPS_Q6_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
+    const int nwarps = NWARPS_Q6_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+    const int nwarps = NWARPS_Q6_K_AMPERE;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+    const int nwarps = NWARPS_Q6_K_PASCAL;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q6_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -3561,13 +4357,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,

 // rope == RoPE == rotary positional embedding
 static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
                                 const float p_delta, const int p_delta_rows, const float theta_scale) {
-    const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

     if (col >= ncols) {
         return;
     }

-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col;

     const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3581,7 +4377,30 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+                                     const float p_delta, const int p_delta_rows, const float theta_scale) {
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = row*ncols + col/2;
+
+    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + ncols/2];
+
+    dst[i + 0]       = x0*cos_theta - x1*sin_theta;
+    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+}
+
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
+                                    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
@@ -3593,8 +4412,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     const int i = row*ncols + col;

     const float col_theta_scale = powf(theta_scale, col);
+    const float p = p0 + p_delta*(row/p_delta_rows);

-    const float theta = p*col_theta_scale;
+    const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);
@@ -3604,7 +4424,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + 0]           = x0*cos_theta - x1*sin_theta;
     dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

-    const float block_theta = block_p*col_theta_scale;
+    const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
     const float sin_block_theta = sinf(block_theta);
     const float cos_block_theta = cosf(block_theta);
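[Editor's note: rope_f32 rotates adjacent element pairs (x[i], x[i+1]), while the new rope_neox_f32 pairs element i with element i + ncols/2, the GPT-NeoX convention; both apply the same 2x2 rotation with angle theta = p * theta_scale^(col/2). A scalar CPU reference of the two pairings, helper names hypothetical and for illustration only:

// Reference of the two RoPE pairings; not taken from the patch.
static void rope_pair(float * v0, float * v1, float theta) {
    const float s = sinf(theta), c = cosf(theta);
    const float x0 = *v0, x1 = *v1;
    *v0 = x0*c - x1*s;
    *v1 = x0*s + x1*c;
}

static void rope_row(float * row, int ncols, float p, float theta_scale, bool neox) {
    float theta = p;
    for (int col = 0; col < ncols; col += 2) {
        if (neox) {
            rope_pair(&row[col/2], &row[col/2 + ncols/2], theta); // half-split pairs
        } else {
            rope_pair(&row[col], &row[col + 1], theta);           // adjacent pairs
        }
        theta *= theta_scale; // theta = p * theta_scale^(col/2)
    }
}]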
@@ -3615,9 +4435,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
 }

-static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+                                 const int n_heads_log2_floor, const float m0, const float m1) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (col >= ncols) {
+        return;
+    }
+
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const int k = row/k_rows;
+
+    float m_k;
+    if (k < n_heads_log2_floor) {
+        m_k = powf(m0, k + 1);
+    } else {
+        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+    }
+
+    dst[i] = col * m_k + x[i];
+}
+
+static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+    const int col = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;

     if (col >= ncols) {
         return;
@@ -3630,24 +4473,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int

 // the CUDA soft max implementation differs from the CPU implementation
 // instead of doubles floats are used
-// values are also not normalized to the maximum value by subtracting it in the exponential function
-// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
 static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int block_size = blockDim.x;
-    const int tid = threadIdx.x;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int block_size = blockDim.y;
+    const int tid = threadIdx.y;
+
+    float max_val = -INFINITY;

-    float tmp = 0.0;
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = row*ncols + col;
+        max_val = max(max_val, x[i]);
+    }

-    for (int block_start = 0; block_start < ncols; block_start += block_size) {
-        const int col = block_start + tid;
+    // find the max value in the block
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+    }

-        if (col >= ncols) {
-            break;
-        }
+    float tmp = 0.f;

+    for (int col = tid; col < ncols; col += block_size) {
         const int i = row*ncols + col;
-        const float val = expf(x[i]);
+        const float val = expf(x[i] - max_val);
         tmp += val;
         dst[i] = val;
     }
@@ -3658,15 +4506,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
     }

-    for (int block_start = 0; block_start < ncols; block_start += block_size) {
-        const int col = block_start + tid;
-
-        if (col >= ncols) {
-            break;
-        }
+    const float inv_tmp = 1.f / tmp;

+    for (int col = tid; col < ncols; col += block_size) {
         const int i = row*ncols + col;
-        dst[i] /= tmp;
+        dst[i] *= inv_tmp;
     }
 }
@@ -3707,14 +4551,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_

 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }

 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
 }

 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
@@ -4014,17 +4868,44 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q4_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
+        nwarps = NWARPS_Q4_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q4_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
+        nwarps = NWARPS_Q4_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
+        mmq_x  =  MMQ_X_Q4_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+        nwarps = NWARPS_Q4_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+        nwarps = NWARPS_Q4_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);

-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        mul_mat_q<QK4_0, QR4_0, QI4_0, false, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-        mul_mat_q<QK4_0, QR4_0, QI4_0, false, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const bool need_check = true;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
MMQ_X_Q4_1_PASCAL; + mmq_y = MMQ_Y_Q4_1_PASCAL; + nwarps = NWARPS_Q4_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q4_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4050,17 +4958,44 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); - - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_0_RDNA2; + mmq_y = MMQ_Y_Q5_0_RDNA2; + nwarps = NWARPS_Q5_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_0_RDNA1; + mmq_y = MMQ_Y_Q5_0_RDNA1; + nwarps = NWARPS_Q5_0_RDNA1; + } else if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q5_0_AMPERE; + mmq_y = MMQ_Y_Q5_0_AMPERE; + nwarps = NWARPS_Q5_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_0_PASCAL; + mmq_y = MMQ_Y_Q5_0_PASCAL; + nwarps = NWARPS_Q5_0_PASCAL; } else { - mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q5_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4068,17 +5003,44 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_1_RDNA2; + mmq_y = MMQ_Y_Q5_1_RDNA2; + nwarps = 
NWARPS_Q5_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_1_RDNA1; + mmq_y = MMQ_Y_Q5_1_RDNA1; + nwarps = NWARPS_Q5_1_RDNA1; + } else if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q5_1_AMPERE; + mmq_y = MMQ_Y_Q5_1_AMPERE; + nwarps = NWARPS_Q5_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_1_PASCAL; + mmq_y = MMQ_Y_Q5_1_PASCAL; + nwarps = NWARPS_Q5_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q5_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4086,17 +5048,44 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q8_0_RDNA2; + mmq_y = MMQ_Y_Q8_0_RDNA2; + nwarps = NWARPS_Q8_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q8_0_RDNA1; + mmq_y = MMQ_Y_Q8_0_RDNA1; + nwarps = NWARPS_Q8_0_RDNA1; + } else if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q8_0_AMPERE; + mmq_y = MMQ_Y_Q8_0_AMPERE; + nwarps = NWARPS_Q8_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q8_0_PASCAL; + mmq_y = MMQ_Y_Q8_0_PASCAL; + nwarps = NWARPS_Q8_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q8_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q8_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4104,17 +5093,44 @@ static void ggml_mul_mat_q2_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int 
block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q2_K_RDNA2; + mmq_y = MMQ_Y_Q2_K_RDNA2; + nwarps = NWARPS_Q2_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q2_K_RDNA1; + mmq_y = MMQ_Y_Q2_K_RDNA1; + nwarps = NWARPS_Q2_K_RDNA1; + } else if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q2_K_AMPERE; + mmq_y = MMQ_Y_Q2_K_AMPERE; + nwarps = NWARPS_Q2_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q2_K_PASCAL; + mmq_y = MMQ_Y_Q2_K_PASCAL; + nwarps = NWARPS_Q2_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q2_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q2_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4122,35 +5138,92 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; +#if QK_K == 256 + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q3_K_RDNA2; + mmq_y = MMQ_Y_Q3_K_RDNA2; + nwarps = NWARPS_Q3_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q3_K_RDNA1; + mmq_y = MMQ_Y_Q3_K_RDNA1; + nwarps = NWARPS_Q3_K_RDNA1; + } else if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q3_K_AMPERE; + mmq_y = MMQ_Y_Q3_K_AMPERE; + nwarps = NWARPS_Q3_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q3_K_PASCAL; + mmq_y = MMQ_Y_Q3_K_PASCAL; + nwarps = NWARPS_Q3_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q3_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q3_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } +#endif } static 
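/* [editor's note] Every ggml_mul_mat_qX_Y_q8_1_cuda launcher in this patch follows the same pattern: pick tile sizes (mmq_x, mmq_y) and a warp count per architecture tier (CC_RDNA2, other AMD cards via CC_OFFSET_AMD, CC_TURING and newer, then DP4A-capable Pascal), derive the grid from the matrix shape, and instantiate mul_mat_qX_Y<need_check> with out-of-bounds checking only when nrows_x is not a multiple of mmq_y. */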
void ggml_mul_mat_q4_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_K_RDNA2; + mmq_y = MMQ_Y_Q4_K_RDNA2; + nwarps = NWARPS_Q4_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_K_RDNA1; + mmq_y = MMQ_Y_Q4_K_RDNA1; + nwarps = NWARPS_Q4_K_RDNA1; + } else if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q4_K_AMPERE; + mmq_y = MMQ_Y_Q4_K_AMPERE; + nwarps = NWARPS_Q4_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_K_PASCAL; + mmq_y = MMQ_Y_Q4_K_PASCAL; + nwarps = NWARPS_Q4_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q4_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4158,17 +5231,44 @@ static void ggml_mul_mat_q5_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_K_RDNA2; + mmq_y = MMQ_Y_Q5_K_RDNA2; + nwarps = NWARPS_Q5_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_K_RDNA1; + mmq_y = MMQ_Y_Q5_K_RDNA1; + nwarps = NWARPS_Q5_K_RDNA1; + } else if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q5_K_AMPERE; + mmq_y = MMQ_Y_Q5_K_AMPERE; + nwarps = NWARPS_Q5_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_K_PASCAL; + mmq_y = MMQ_Y_Q5_K_PASCAL; + nwarps = NWARPS_Q5_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, 
nrows_dst); } else { - mul_mat_q, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q5_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4176,17 +5276,44 @@ static void ggml_mul_mat_q6_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q6_K_RDNA2; + mmq_y = MMQ_Y_Q6_K_RDNA2; + nwarps = NWARPS_Q6_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q6_K_RDNA1; + mmq_y = MMQ_Y_Q6_K_RDNA1; + nwarps = NWARPS_Q6_K_RDNA1; + } else if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q6_K_AMPERE; + mmq_y = MMQ_Y_Q6_K_AMPERE; + nwarps = NWARPS_Q6_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q6_K_PASCAL; + mmq_y = MMQ_Y_Q6_K_PASCAL; + nwarps = NWARPS_Q6_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q6_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q6_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4236,31 +5363,50 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { - GGML_ASSERT(nrows % 2 == 0); - const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1); + GGML_ASSERT(ncols % 2 == 0); + const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); - const dim3 block_nums(num_blocks_x, nrows, 1); + const dim3 block_nums(nrows, num_blocks_x, 1); rope_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); } -static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) { - GGML_ASSERT(nrows % 4 == 0); - const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1); - const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE); +static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, + const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { + GGML_ASSERT(ncols % 2 == 0); + const dim3 block_dims(1, 
CUDA_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(nrows, num_blocks_x, 1); + rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); +} + +static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, + const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) { + GGML_ASSERT(ncols % 4 == 0); + const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); + const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; const dim3 block_nums(num_blocks_x, nrows, 1); - rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale); + rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx); +} + +static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, + const int k_rows, const int n_heads_log2_floor, const float m0, + const float m1, cudaStream_t stream) { + const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1); + const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE); + const dim3 block_nums(num_blocks_x, nrows, 1); + alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1); } static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) { - const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1); + const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1); const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; - const dim3 block_nums(block_num_x, nrows_x, 1); + const dim3 block_nums(nrows_x, block_num_x, 1); diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past); } static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) { - const dim3 block_dims(WARP_SIZE, 1, 1); - const dim3 block_nums(1, nrows_x, 1); + const dim3 block_dims(1, WARP_SIZE, 1); + const dim3 block_nums(nrows_x, 1, 1); soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x); } @@ -4288,7 +5434,6 @@ struct cuda_buffer { static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS]; static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; -static bool g_mul_mat_q = false; static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { scoped_spin_lock lock(g_cuda_pool_lock); @@ -4358,46 +5503,46 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) { } -static void * g_scratch_buffer = nullptr; -static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default -static size_t g_scratch_offset = 0; - -static int g_device_count = -1; -static int g_main_device = 0; -static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; -static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; - -static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; - -static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr }; - void ggml_init_cublas() { static bool initialized = false; if (!initialized) { + +#ifdef __HIP_PLATFORM_AMD__ + // Workaround for a rocBLAS bug when using multiple graphics cards: + // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 + rocblas_initialize(); + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); int64_t total_vram = 0; - 
fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count); - for (int id = 0; id < g_device_count; ++id) { + fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); + for (int64_t id = 0; id < g_device_count; ++id) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); + fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); g_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; - +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; +#else g_compute_capabilities[id] = 100*prop.major + 10*prop.minor; +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } - for (int id = 0; id < g_device_count; ++id) { + for (int64_t id = 0; id < g_device_count; ++id) { g_tensor_split[id] /= total_vram; } - for (int id = 0; id < g_device_count; ++id) { - CUDA_CHECK(cudaSetDevice(id)); + for (int64_t id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); - // create main stream - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking)); + // create cuda streams + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking)); + } // create cublas handle CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id])); @@ -4466,7 +5611,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( if (src->backend == GGML_BACKEND_CPU) { kind = cudaMemcpyHostToDevice; src_ptr = (char *) src->data; - } else if (src->backend == GGML_BACKEND_GPU) { + } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { + GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); kind = cudaMemcpyDeviceToDevice; struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; @@ -4505,389 +5651,366 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( } inline void ggml_cuda_op_add( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ - - GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr); - GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + GGML_ASSERT(src1->type == GGML_TYPE_F32); const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - // compute if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); + add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main); + add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream); } else { GGML_ASSERT(false); } (void) src1; (void) dst; - (void) 
src0_ddq_i; - (void) i02; - (void) i1; } inline void ggml_cuda_op_mul( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); - - const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); + mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream); (void) dst; - (void) src0_ddq_i; - (void) i02; - (void) i1; } inline void ggml_cuda_op_gelu( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); - const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; - - // compute - gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); + gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_silu( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ - - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); - // compute - silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); + silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_norm( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - 
GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + const int64_t nrows = ggml_nrows(src0); - // compute - norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); + norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_rms_norm( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + const int64_t nrows = ggml_nrows(src0); float eps; memcpy(&eps, dst->op_params, sizeof(float)); - // compute - rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main); + rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_mul_mat_q( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ - - GGML_ASSERT(src0_ddq_i != nullptr); - GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { const int64_t ne00 = src0->ne[0]; const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; GGML_ASSERT(ne10 % QK8_1 == 0); const int64_t ne0 = dst->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + const int64_t row_diff = row_high - row_low; int id; CUDA_CHECK(cudaGetDevice(&id)); // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff; - - const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ? - ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; - size_t as; - void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as); - quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main); + const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? 
ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: - ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q4_1: - ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q5_0: - ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q5_1: - ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q8_0: - ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q2_K: - ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q3_K: - ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q4_K: - ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q5_K: - ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; case GGML_TYPE_Q6_K: - ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main); + ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; default: GGML_ASSERT(false); break; } - ggml_cuda_pool_free(src1_q8_1, as); - (void) src1; (void) dst; - (void) src0_ddf_i; - (void) i02; - (void) i1; + (void) src1_ddf_i; } -inline void ggml_cuda_op_mul_mat_vec( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ +static int64_t get_row_rounding(ggml_type type) { + int64_t min_compute_capability = INT_MAX; + int64_t max_compute_capability = 
INT_MIN; + for (int64_t id = 0; id < g_device_count; ++id) { + if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + if (min_compute_capability > g_compute_capabilities[id]) { + min_compute_capability = g_compute_capabilities[id]; + } + if (max_compute_capability < g_compute_capabilities[id]) { + max_compute_capability = g_compute_capabilities[id]; + } + } + } - GGML_ASSERT(src0_ddq_i != nullptr); - GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + case GGML_TYPE_F16: + return 1; + case GGML_TYPE_Q2_K: + return max_compute_capability >= CC_RDNA2 ? 128 : 32; + case GGML_TYPE_Q3_K: + return min_compute_capability < CC_RDNA2 ? 128 : 64; + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + default: + GGML_ASSERT(false); + } +#else + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return max_compute_capability >= CC_TURING ? 128 : 64; + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 64; + case GGML_TYPE_F16: + return 1; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + return max_compute_capability >= CC_TURING ? 128 : 64; + case GGML_TYPE_Q6_K: + return 64; + default: + GGML_ASSERT(false); + } +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +} + +inline void ggml_cuda_op_mul_mat_vec_q( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { const int64_t ne00 = src0->ne[0]; - const int64_t nrows = i01_high - i01_low; + const int64_t row_diff = row_high - row_low; -#ifdef GGML_CUDA_FORCE_DMMV - const bool use_mul_mat_vec_q = false; - (void) g_compute_capabilities[0]; -#else - int id; - CUDA_CHECK(cudaGetDevice(&id)); + switch (src0->type) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; + } - bool 
mul_mat_vec_q_implemented = - src0->type == GGML_TYPE_Q4_0 || - src0->type == GGML_TYPE_Q4_1 || - src0->type == GGML_TYPE_Q5_0 || - src0->type == GGML_TYPE_Q5_1 || - src0->type == GGML_TYPE_Q8_0; -#if QK_K == 256 - mul_mat_vec_q_implemented = mul_mat_vec_q_implemented || - src0->type == GGML_TYPE_Q2_K || - src0->type == GGML_TYPE_Q3_K || - src0->type == GGML_TYPE_Q4_K || - src0->type == GGML_TYPE_Q5_K || - src0->type == GGML_TYPE_Q6_K; -#endif // QK_K == 256 - - const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented; -#endif + (void) src1; + (void) dst; + (void) src1_ddf_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} - if (use_mul_mat_vec_q) { - const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ? - ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; - size_t as; - void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as); - quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main); - - switch (src0->type) { - case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - default: - GGML_ASSERT(false); - break; - } +inline void ggml_cuda_op_dequantize_mul_mat_vec( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { - ggml_cuda_pool_free(src1_q8_1, as); - } else { - // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics #ifdef GGML_CUDA_F16 - size_t ash; - dfloat * src1_dfloat = nullptr; // dfloat == half - - bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || - src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || - src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; - - if (src1_convert_f16) { - src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash); - ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00, - ne00, 1, sizeof(float), 
0, 0, - ne00, 1, sizeof(half), 0, 0, cudaStream_main); - } + size_t ash; + dfloat * src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash); + ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00, + ne00, 1, sizeof(float), 0, 0, + ne00, 1, sizeof(half), 0, 0, stream); + } #else - dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion + const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion #endif // GGML_CUDA_F16 - switch (src0->type) { - case GGML_TYPE_Q4_0: - dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q4_1: - dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_0: - dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_1: - dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q8_0: - dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q2_K: - dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q3_K: - dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q4_K: - dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q5_K: - dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_Q6_K: - dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - case GGML_TYPE_F16: - convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); - break; - default: - GGML_ASSERT(false); - break; - } + switch (src0->type) { + case GGML_TYPE_Q4_0: + dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, 
src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_F16: + convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; + } #ifdef GGML_CUDA_F16 - if (src1_convert_f16) { - ggml_cuda_pool_free(src1_dfloat, ash); - } -#endif // GGML_CUDA_F16 + if (src1_convert_f16) { + ggml_cuda_pool_free(src1_dfloat, ash); } +#endif // GGML_CUDA_F16 (void) src1; (void) dst; - (void) src0_ddf_i; - (void) i02; - (void) i1; + (void) src1_ddq_i; + (void) src1_ncols; + (void) src1_padded_row_size; } inline void ggml_cuda_op_mul_mat_cublas( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, const cudaStream_t & stream) { - GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(src0_dd_i != nullptr); GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(dst_dd_i != nullptr); const float alpha = 1.0f; const float beta = 0.0f; @@ -4895,43 +6018,54 @@ inline void ggml_cuda_op_mul_mat_cublas( const int64_t ne00 = src0->ne[0]; const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; const int64_t ne0 = dst->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + const int64_t row_diff = row_high - row_low; + + float * src0_ddq_as_f32; + size_t src0_as = 0; + + if (src0->type != GGML_TYPE_F32) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); + src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT + to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream); + } + const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32; int id; CUDA_CHECK(cudaGetDevice(&id)); // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into - int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff; + int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? 
ne0 : row_diff; - CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main)); + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); CUBLAS_CHECK( cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, - i01_diff, ne11, ne10, + row_diff, src1_ncols, ne10, &alpha, src0_ddf_i, ne00, - src1_ddf_i, ne10, - &beta, dst_ddf_i, ldc)); + src1_ddf_i, ne10, + &beta, dst_dd_i, ldc)); + + if (src0_as > 0) { + ggml_cuda_pool_free(src0_ddq_as_f32, src0_as); + } (void) dst; - (void) src0_ddq_i; - (void) i02; - (void) i1; + (void) src1_ddq_i; + (void) src1_padded_row_size; } inline void ggml_cuda_op_rope( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - const int64_t i01_diff = i01_high - i01_low; + const int64_t nrows = ggml_nrows(src0); const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; @@ -4944,344 +6078,438 @@ inline void ggml_cuda_op_rope( memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; - const bool is_glm = mode & 4; + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; // compute if (is_glm) { - const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale; - const float id_p = min(p, n_ctx - 2.f); - const float block_p = max(p - (n_ctx - 2.f), 0.f); - rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); + rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream); + } else if (is_neox) { + GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"); + rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream); } else { - const float p0 = (((mode & 1) == 0 ? 
n_past : 0)) * freq_scale; - rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main); + rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream); } (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i1; + (void) src1_dd; +} + +inline void ggml_cuda_op_alibi( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t nrows = ggml_nrows(src0); + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + GGML_ASSERT(ne01 + n_past == ne00); + GGML_ASSERT(n_head == ne02); + + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream); + + (void) src1; + (void) src1_dd; } inline void ggml_cuda_op_diag_mask_inf( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - const int64_t i01_diff = i01_high - i01_low; + const int nrows0 = ggml_nrows(src0); const int n_past = ((int32_t *) dst->op_params)[0]; - // compute - diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main); + diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } inline void ggml_cuda_op_soft_max( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; + const int64_t nrows = ggml_nrows(src0); - // compute - soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); + soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } inline void 
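/* [editor's note] From here on the per-op functions share the new flattened signatures introduced by this patch: each op receives plain device pointers (src0_dd, src1_dd, dst_dd) plus a single stream, and ggml_cuda_op_flatten / ggml_cuda_op_mul_mat below take over the device selection, pool allocation, and host<->device staging that the old per-row ggml_cuda_op loop used to interleave with the compute. */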
ggml_cuda_op_scale( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, - float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, - cudaStream_t & cudaStream_main){ + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); const float scale = ((float *) src1->data)[0]; - const int64_t ne00 = src0->ne[0]; - const int64_t i01_diff = i01_high - i01_low; - - // compute - scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main); + scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); (void) src1; (void) dst; - (void) src0_ddq_i; - (void) src1_ddf_i; - (void) i02; - (void) i1; + (void) src1_dd; } -static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) { +static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) { + const int64_t nrows0 = ggml_nrows(src0); + + const bool use_src1 = src1 != nullptr; + const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; + + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); + + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? 
(ggml_tensor_extra_gpu *) src1->extra : nullptr; + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; + + const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE; + + // dd = data device + float * src0_ddf = nullptr; + float * src1_ddf = nullptr; + float * dst_ddf = nullptr; + + // as = actual size + size_t src0_asf = 0; + size_t src1_asf = 0; + size_t dst_asf = 0; + + ggml_cuda_set_device(g_main_device); + const cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + if (src0_on_device) { + src0_ddf = (float *) src0_extra->data_device[g_main_device]; + } else { + src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream)); + } + + if (use_src1 && !src1_stays_on_host) { + if (src1_on_device) { + src1_ddf = (float *) src1_extra->data_device[g_main_device]; + } else { + src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream)); + } + } + if (dst_on_device) { + dst_ddf = (float *) dst_extra->data_device[g_main_device]; + } else { + dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf); + } + + // do the computation + op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); + CUDA_CHECK(cudaGetLastError()); + + // copy dst to host if necessary + if (!dst_on_device) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); + } + + if (src0_asf > 0) { + ggml_cuda_pool_free(src0_ddf, src0_asf); + } + if (src1_asf > 0) { + ggml_cuda_pool_free(src1_ddf, src1_asf); + } + if (dst_asf > 0) { + ggml_cuda_pool_free(dst_ddf, dst_asf); + } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(cudaDeviceSynchronize()); + } +} + +void ggml_cuda_set_peer_access(const int n_tokens) { + static bool peer_access_enabled = false; + + const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE; + + if (peer_access_enabled == enable_peer_access) { + return; + } + +#ifdef NDEBUG + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); + + for (int id_other = 0; id_other < g_device_count; ++id_other) { + if (id == id_other) { + continue; + } + if (id != g_main_device && id_other != g_main_device) { + continue; + } + + int can_access_peer; + CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other)); + if (can_access_peer) { + if (enable_peer_access) { + CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0)); + } else { + CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other)); + } + } + } + } +#endif // NDEBUG + + peer_access_enabled = enable_peer_access; +} + +static void ggml_cuda_op_mul_mat( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, + const bool convert_src1_to_q8_1) { + const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; const int64_t nrows0 = ggml_nrows(src0); - const bool use_src1 = src1 != nullptr; - const int64_t ne10 = use_src1 ? src1->ne[0] : 1; - const int64_t ne11 = use_src1 ? src1->ne[1] : 1; - const int64_t ne12 = use_src1 ? 
src1->ne[2] : 1; - const int64_t ne13 = use_src1 ? src1->ne[3] : 1; - const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + const int64_t nrows1 = ggml_nrows(src1); GGML_ASSERT(ne03 == ne13); const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + ggml_cuda_set_peer_access(ne11); GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); - // strides for iteration over dims 3 and 2 - const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13; - const int64_t num_iters = flatten_rows ? 1 : num_iters_0; - const int64_t stride_mod = flatten_rows ? num_iters_0 : 1; - const int64_t src0_stride = ne00 * ne01 * stride_mod; - const int64_t src1_stride = ne10 * ne11 * stride_mod; - const int64_t dst_stride = ne0 * ne1 * stride_mod; + GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); - const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01; - const int64_t i03_max = flatten_rows ? 1 : ne03; - const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12); - const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02; - GGML_ASSERT(!(flatten_rows && ne02 < ne12)); + const int64_t i02_divisor = ne12 / ne02; const size_t src0_ts = ggml_type_size(src0->type); const size_t src0_bs = ggml_blck_size(src0->type); + const size_t q8_1_ts = sizeof(block_q8_1); + const size_t q8_1_bs = QK8_1; - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); - const bool src0_is_f32 = src0->type == GGML_TYPE_F32; - const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1); - const bool src1_stays_on_host = use_src1 && ( - dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE); + const bool src1_is_contiguous = ggml_is_contiguous(src1); + const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ? 
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + GGML_ASSERT(!(split && ne02 > 1)); + GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); - // dd = data device - char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized - float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float - float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; - float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; - - // asq = actual size quantized, asf = actual size float - size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0}; - size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0}; - size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0}; - size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0}; + char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr}; + float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float + char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1 + float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr}; - // if multiple devices are used they need to wait for the main device - // here an event is recorded that signifies that the main device has finished calculating the input data - if (split && g_device_count > 1) { - CUDA_CHECK(cudaSetDevice(g_main_device)); - CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device])); - } + // as = actual size + size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0}; + size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0}; - for (int id = 0; id < g_device_count; ++id) { - if (!split && id != g_main_device) { - continue; - } + int64_t row_low[GGML_CUDA_MAX_DEVICES]; + int64_t row_high[GGML_CUDA_MAX_DEVICES]; - const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + for (int64_t id = 0; id < g_device_count; ++id) { + // by default, use all rows + row_low[id] = 0; + row_high[id] = ne01; - int64_t row_low, row_high; + // for multi GPU, get the row boundaries from tensor split + // and round to mul_mat_q tile sizes if (split) { - row_low = id == 0 ? 
0 : nrows0*g_tensor_split[id]; - row_low -= row_low % GGML_CUDA_MMQ_Y; + const int64_t rounding = get_row_rounding(src0->type); - if (id == g_device_count - 1) { - row_high = nrows0; - } else { - row_high = nrows0*g_tensor_split[id + 1]; - row_high -= row_high % GGML_CUDA_MMQ_Y; + if (id != 0) { + row_low[id] = ne01*g_tensor_split[id]; + row_low[id] -= row_low[id] % rounding; + } + + if (id != g_device_count - 1) { + row_high[id] = ne01*g_tensor_split[id + 1]; + row_high[id] -= row_high[id] % rounding; } - } else { - row_low = 0; - row_high = nrows0*i02_divisor; } - if (row_low == row_high) { + } + + for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { continue; } - int64_t row_diff = row_high - row_low; + const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; - cudaSetDevice(id); - cudaStream_t cudaStream_main = g_cudaStreams_main[id]; - - // wait for main GPU data if necessary - if (split && id != g_main_device) { - CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device])); - } + ggml_cuda_set_device(id); + const cudaStream_t stream = g_cudaStreams[id][0]; if (src0_on_device && src0_is_contiguous) { - if (src0_is_f32) { - src0_ddf[id] = (float *) src0_extra->data_device[id]; - } else { - src0_ddq[id] = (char *) src0_extra->data_device[id]; - } + src0_dd[id] = (char *) src0_extra->data_device[id]; } else { - if (src0_is_f32) { - src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]); - } else { - src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]); - } + const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); + src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]); } - if (src0_needs_f32 && !src0_is_f32) { - src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]); + if (src1_on_device && src1_is_contiguous) { + src1_ddf[id] = (float *) src1_extra->data_device[id]; + } else { + src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]); } - if (use_src1 && !src1_stays_on_host) { - if (src1_on_device && src1_is_contiguous) { - src1_ddf[id] = (float *) src1_extra->data_device[id]; - } else { - src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]); + if (convert_src1_to_q8_1) { + src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); + + if (split && src1_on_device && src1_is_contiguous) { + quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); + CUDA_CHECK(cudaGetLastError()); } } + if (dst_on_device) { - dst_ddf[id] = (float *) dst_extra->data_device[id]; + dst_dd[id] = (float *) dst_extra->data_device[id]; } else { - size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float); - dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]); + const size_t size_dst_ddf = split ? 
(row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst); + dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]); } + } - for (int64_t i03 = 0; i03 < i03_max; i03++) { - const int64_t i13 = i03 % ne13; - for (int64_t i02 = 0; i02 < i02_max; i02++) { - const int64_t i12 = i02 % ne12; + // if multiple devices are used they need to wait for the main device + // here an event is recorded that signals that the main device has finished calculating the input data + if (split && g_device_count > 1) { + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0])); + } - const int64_t i0 = i03*i02_max + i02; + const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; + for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { + const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0; + const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; - // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs - const int64_t i0_offset_low = row_low/rows_per_iter; - const int64_t i0_offset_high = row_high/rows_per_iter; + for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { + continue; + } - int64_t i01_low = 0; - int64_t i01_high = rows_per_iter; - if (split) { - if (i0 < i0_offset_low || i0 > i0_offset_high) { - continue; - } - if (i0 == i0_offset_low) { - i01_low = row_low % rows_per_iter; - } - if (i0 == i0_offset_high) { - i01_high = row_high % rows_per_iter; - } - } + const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + const int64_t row_diff = row_high[id] - row_low[id]; - // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables. - // Removing the first assert or changing the order of the arguments causes the second assert to fail. - // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output. - // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU). 
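[Editor's note, not part of the patch: the scheduling loop at the start of this hunk replaces the old per-(i03, i02) iteration. It walks src1 in column chunks and round-robins the chunks over per-device CUDA streams. A minimal standalone sketch of that bookkeeping follows; the chunk size and stream count here are illustrative stand-ins for MUL_MAT_SRC1_COL_STRIDE and MAX_STREAMS.]

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne11        = 300;  // hypothetical src1 column count
    const int64_t col_stride  = 128;  // stand-in for MUL_MAT_SRC1_COL_STRIDE
    const int64_t max_streams = 8;    // stand-in for MAX_STREAMS

    for (int64_t col0 = 0; col0 < ne11; col0 += col_stride) {
        const int64_t is    = (col0/col_stride) % max_streams;       // stream slot, as in the hunk above
        const int64_t ncols = col0 + col_stride > ne11 ? ne11 - col0 // the last chunk may be short
                                                       : col_stride;
        printf("cols [%lld, %lld) -> stream %lld\n",
               (long long) col0, (long long) (col0 + ncols), (long long) is);
    }
    return 0;
}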
- GGML_ASSERT(i01_low == 0 || g_device_count > 1); - GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1); + ggml_cuda_set_device(id); + const cudaStream_t stream = g_cudaStreams[id][is]; - const int64_t i01_diff = i01_high - i01_low; - if (i01_diff == 0) { - continue; - } - const int64_t i11 = i13*ne12 + i12; + // wait for main GPU data if necessary + if (split && (id != g_main_device || is != 0)) { + CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0)); + } + + for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { + const int64_t i03 = i0 / ne12; + const int64_t i02 = i0 % ne12; + + const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; // for split tensors the data begins at i0 == i0_offset_low - char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs; - float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride; - float * src1_ddf_i = src1_ddf[id] + i11*src1_stride; - float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride; - - // for split tensors the data pointer needs to be rounded down - // to the bin edge for i03, i02 bins beyond the first - if (i0 - i0_offset_low > 0) { - GGML_ASSERT(!flatten_rows); - src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs; - src0_ddf_i -= (row_low % ne01)*ne00; - dst_ddf_i -= (row_low % ne0)*ne1; - } + char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs; + float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10; + char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset; + float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff); // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) { - dst_ddf_i += i01_low; // offset is 0 if no tensor split + dst_dd_i += row_low[id]; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary - if (use_src1 && !src1_stays_on_host) { - if (src1->backend == GGML_BACKEND_CPU) { - GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1)); - int64_t nrows1 = flatten_rows ? 
nrows0 : ne11; - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main)); - } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { - if (id != g_main_device) { - GGML_ASSERT(!flatten_rows); + if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { + if (id != g_main_device) { + if (convert_src1_to_q8_1) { + char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset; + CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, + cudaMemcpyDeviceToDevice, stream)); + } else { float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; - src1_ddf_i_source += i11*src1_stride; - CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float), - cudaMemcpyDeviceToDevice, cudaStream_main)); + src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10; + CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float), + cudaMemcpyDeviceToDevice, stream)); } - } else if (src1_on_device && !src1_is_contiguous) { - GGML_ASSERT(!split); - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main)); - } else { - GGML_ASSERT(false); } + } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { + CUDA_CHECK(ggml_cuda_cpy_tensor_2d( + src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); + } else { + GGML_ASSERT(false); } - if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { - if (src0_is_f32) { - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main)); - } else { - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main)); - } + if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) { + quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); + CUDA_CHECK(cudaGetLastError()); } - // convert src0 to f32 if it is necessary for the ggml_cuda_op - if (src0_needs_f32 && !src0_is_f32) { - to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main); - CUDA_CHECK(cudaGetLastError()); + if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream)); } // do the computation - op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main); + op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, + row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); // copy dst to host or other device if necessary @@ -5303,95 +6531,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU. // Instead they need to be copied to the correct slice in ne0 = dst row index. // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. 
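[Editor's note, not part of the patch: the comment above is the key to the cudaMemcpy2DAsync call that follows. Written as plain host loops over the same variable names, the pitched copy amounts to the sketch below: each of the src1_ncols result columns is row_diff floats wide and must land at row offset row_low[id] inside a dst column that is ne0 floats wide.]

#include <cstdint>
#include <cstring>

// Host-side equivalent of the pitched device-to-host copy in this hunk.
static void copy_dst_slice(float * dst, const float * partial, int64_t ne0,
                           int64_t row_low, int64_t row_diff, int64_t src1_ncols) {
    for (int64_t col = 0; col < src1_ncols; ++col) {
        std::memcpy(dst     + col*ne0      + row_low, // dst pitch: ne0 floats
                    partial + col*row_diff,           // src pitch: row_diff floats
                    row_diff*sizeof(float));          // width: this device's row slice
    }
}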
- float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3); - CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float), - i01_diff*sizeof(float), ne1, kind, cudaStream_main)); + float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0 + row_low[id]; + CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float), + row_diff*sizeof(float), src1_ncols, kind, stream)); } else { float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); - CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main)); + GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0; + CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream)); } } - // signify to main device that other device is done - if (split && g_device_count > 1 && id != g_main_device) { - CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main)); + // add event for the main device to wait on until other device is done + if (split && (id != g_main_device || is != 0)) { + CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream)); } } } } - // wait until each device is finished, then free their buffers - for (int id = 0; id < g_device_count; ++id) { - if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) { - continue; - } - - CUDA_CHECK(cudaSetDevice(id)); + for (int64_t id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); - if (src0_asq[id] > 0) { - ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]); - } - if (src0_asf[id] > 0) { - ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]); + // free buffers again when done + if (src0_as[id] > 0) { + ggml_cuda_pool_free(src0_dd[id], src0_as[id]); } if (src1_asf[id] > 0) { ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]); } - if (dst_asf[id] > 0) { - ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]); + if (src1_asq[id] > 0) { + ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]); + } + if (dst_as[id] > 0) { + ggml_cuda_pool_free(dst_dd[id], dst_as[id]); } } // main device waits for all other devices to be finished if (split && g_device_count > 1) { - CUDA_CHECK(cudaSetDevice(g_main_device)); - for (int id = 0; id < g_device_count; ++id) { - if (id != g_main_device && src0_extra->events[id]) { - CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id])); + int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; + is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + for (int64_t id = 0; id < g_device_count; ++id) { + for (int64_t is = 0; is < is_max; ++is) { + CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0)); } } } if (dst->backend == GGML_BACKEND_CPU) { - CUDA_CHECK(cudaSetDevice(g_main_device)); + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(cudaDeviceSynchronize()); } } void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op. - // Due to flatten_rows == true this does in practice not make a difference however. - // Better solution would be nice but right now that would require disproportionate changes. 
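[Editor's note, not part of the patch: ggml_cuda_set_peer_access, introduced earlier in this hunk, is compiled only under NDEBUG and toggles peer access for device pairs that involve the main device, since peer copies only pay off for small batches (n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE). A standalone sketch of that toggle using only stock CUDA runtime calls might look like this; error handling is reduced to a printf.]

#include <cuda_runtime.h>
#include <cstdio>

static void set_peer_access(int main_device, bool enable) {
    int n = 0;
    cudaGetDeviceCount(&n);
    for (int id = 0; id < n; ++id) {
        cudaSetDevice(id);
        for (int other = 0; other < n; ++other) {
            if (id == other || (id != main_device && other != main_device)) {
                continue; // only pairs involving the main device are toggled
            }
            int can_access = 0;
            cudaDeviceCanAccessPeer(&can_access, id, other);
            if (!can_access) {
                continue;
            }
            const cudaError_t err = enable ? cudaDeviceEnablePeerAccess(other, 0)
                                           : cudaDeviceDisablePeerAccess(other);
            if (err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled) {
                printf("peer access toggle failed for %d <-> %d: %d\n", id, other, (int) err);
            }
        }
    }
}

int main() {
    set_peer_access(0, /*enable =*/ true);
    return 0;
}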
- GGML_ASSERT( - (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && - src1->type == GGML_TYPE_F32 && - (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16)); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add); } void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul); } void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu); } void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu); } void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm); } void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); } bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { @@ -5425,8 +6644,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr const int64_t ne12 = src1->ne[2]; - CUDA_CHECK(cudaSetDevice(g_main_device)); - cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; @@ -5437,7 +6656,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; - ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main); + ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); } void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ @@ -5456,8 +6675,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 const int64_t nb01 = src0->nb[1]; const int64_t nb02 = src0->nb[2]; - CUDA_CHECK(cudaSetDevice(g_main_device)); - cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = 
src0_extra->data_device[g_main_device]; @@ -5468,38 +6687,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; - const int row_stride_x = nb01 / sizeof(half); - const int channel_stride_x = nb02 / sizeof(half); + const int64_t row_stride_x = nb01 / sizeof(half); + const int64_t channel_stride_x = nb02 / sizeof(half); - ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main); + ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); } void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; + int64_t min_compute_capability = INT_MAX; + for (int64_t id = 0; id < g_device_count; ++id) { + if (min_compute_capability > g_compute_capabilities[id] + && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + min_compute_capability = g_compute_capabilities[id]; + } + } + if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_p021(src0, src1, dst); } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_nc(src0, src1, dst); }else if (src0->type == GGML_TYPE_F32) { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false); - } else { - int min_compute_capability = INT_MAX; - for (int id = 0; id < g_device_count; ++id) { - if (min_compute_capability > g_compute_capabilities[id] - && g_tensor_split[id] < (id + 1 < g_device_count ? 
g_tensor_split[id + 1] : 1.0f)) { - min_compute_capability = g_compute_capabilities[id]; - } - } +#ifdef GGML_CUDA_FORCE_DMMV + const bool use_mul_mat_vec_q = false; +#else + const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); +#endif // GGML_CUDA_FORCE_DMMV + + if (use_mul_mat_vec_q) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); + } else { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); + } + } else { if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false); + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); } else { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } } } else { @@ -5508,8 +6738,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ } void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale); } void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5538,8 +6767,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens const int64_t nb11 = src1->nb[1]; const int64_t nb12 = src1->nb[2]; - CUDA_CHECK(cudaSetDevice(g_main_device)); - cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; @@ -5549,10 +6778,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, cudaStream_main); + ne10, ne11, nb10, nb11, nb12, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, cudaStream_main); + ne10, ne11, nb10, nb11, nb12, main_stream); } else { GGML_ASSERT(false); } @@ -5566,21 +6795,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens } void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf); } void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max); } void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + 
GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope); +} - const int mode = ((int32_t *) dst->op_params)[2]; - const bool is_glm = mode & 4; - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm +void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); } void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5590,7 +6818,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens } void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { - int nrows = ggml_nrows(tensor); + const int64_t nrows = ggml_nrows(tensor); const int64_t ne0 = tensor->ne[0]; @@ -5600,26 +6828,28 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); - for (int id = 0; id < g_device_count; ++id) { + for (int64_t id = 0; id < g_device_count; ++id) { if (backend == GGML_BACKEND_GPU && id != g_main_device) { continue; } - cudaSetDevice(id); + ggml_cuda_set_device(id); - int row_low, row_high; + int64_t row_low, row_high; if (backend == GGML_BACKEND_GPU) { row_low = 0; row_high = nrows; } else if (backend == GGML_BACKEND_GPU_SPLIT) { + const int64_t rounding = get_row_rounding(tensor->type); + row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; - row_low -= row_low % GGML_CUDA_MMQ_Y; + row_low -= row_low % rounding; if (id == g_device_count - 1) { row_high = nrows; } else { row_high = nrows*g_tensor_split[id + 1]; - row_high -= row_high % GGML_CUDA_MMQ_Y; + row_high -= row_high % rounding; } } else { GGML_ASSERT(false); @@ -5655,7 +6885,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { extra->data_device[id] = buf; if (backend == GGML_BACKEND_GPU_SPLIT) { - CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming)); + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + } } } @@ -5669,15 +6901,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - for (int id = 0; id < g_device_count; ++id) { + for (int64_t id = 0; id < g_device_count; ++id) { if (extra->data_device[id] != nullptr) { - CUDA_CHECK(cudaSetDevice(id)); + CUDA_CHECK(ggml_cuda_set_device(id)); CUDA_CHECK(cudaFree(extra->data_device[id])); } - if (extra->events[id] != nullptr) { - CUDA_CHECK(cudaSetDevice(id)); - CUDA_CHECK(cudaEventDestroy(extra->events[id])); + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + if (extra->events[id][is] != nullptr) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } } } @@ -5700,7 +6934,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { return extra; } -void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) { +void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { if (scratch && g_scratch_size == 0) { return; } @@ -5709,14 +6943,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo if (tensor->src[0] != nullptr && 
tensor->src[0]->backend == GGML_BACKEND_CPU) { const ggml_op src0_op = tensor->src[0]->op; if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { - ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace); + ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); } } if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { - ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace); + ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); } tensor->backend = GGML_BACKEND_GPU; + + if (scratch && no_alloc) { + return; + } + struct ggml_tensor_extra_gpu * extra; const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || @@ -5724,7 +6963,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo force_inplace; const size_t size = ggml_nbytes(tensor); - CUDA_CHECK(cudaSetDevice(g_main_device)); + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; @@ -5768,19 +7007,52 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo tensor->extra = extra; } +void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) { + if (g_scratch_size == 0) { + return; + } + if (g_scratch_buffer == nullptr) { + ggml_cuda_set_device(g_main_device); + CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); + } + + struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); + + const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || + tensor->op == GGML_OP_VIEW; + + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + size_t view_offset = 0; + if (tensor->op == GGML_OP_VIEW) { + memcpy(&view_offset, tensor->op_params, sizeof(size_t)); + } + extra->data_device[g_main_device] = src0_ddc + view_offset; + } else { + extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset; + } + + tensor->extra = extra; +} + void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, true, false); + ggml_cuda_assign_buffers_impl(tensor, true, false, false); +} + +void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, true, false, true); } void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, false, false); + ggml_cuda_assign_buffers_impl(tensor, false, false, false); } void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, false, true); + ggml_cuda_assign_buffers_impl(tensor, false, true, false); } -void ggml_cuda_set_main_device(int main_device) { +void ggml_cuda_set_main_device(const int main_device) { if (main_device >= g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. 
Using device %d instead.\n", main_device, g_device_count, g_main_device); @@ -5794,11 +7066,11 @@ void ggml_cuda_set_main_device(int main_device) { } } -void ggml_cuda_set_mul_mat_q(bool mul_mat_q) { +void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) { g_mul_mat_q = mul_mat_q; } -void ggml_cuda_set_scratch_size(size_t scratch_size) { +void ggml_cuda_set_scratch_size(const size_t scratch_size) { g_scratch_size = scratch_size; } @@ -5916,6 +7188,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ } func = ggml_cuda_rope; break; + case GGML_OP_ALIBI: + if (!any_on_device) { + return false; + } + func = ggml_cuda_alibi; + break; default: return false; } @@ -5929,3 +7207,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func(tensor->src[0], tensor->src[1], tensor); return true; } + +int ggml_cuda_get_device_count() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + return device_count; +} + +void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); + snprintf(description, description_size, "%s", prop.name); +} diff --git a/ggml-cuda.h b/ggml-cuda.h index 72d7afa463d741498af0063f0ccf14e5b9028bf9..a72e82069b9f1430fdbcc007f93ae26ea6c2f924 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -2,35 +2,44 @@ #include "ggml.h" +#ifdef GGML_USE_HIPBLAS +#define GGML_CUDA_NAME "ROCm" +#define GGML_CUBLAS_NAME "hipBLAS" +#else +#define GGML_CUDA_NAME "CUDA" +#define GGML_CUBLAS_NAME "cuBLAS" +#endif + #ifdef __cplusplus extern "C" { #endif #define GGML_CUDA_MAX_DEVICES 16 -void ggml_init_cublas(void); -void ggml_cuda_set_tensor_split(const float * tensor_split); - -void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); - -// TODO: export these with GGML_API -void * ggml_cuda_host_malloc(size_t size); -void ggml_cuda_host_free(void * ptr); - -void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); - -void ggml_cuda_free_data(struct ggml_tensor * tensor); -void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); -void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); -void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); -void ggml_cuda_set_main_device(int main_device); -void ggml_cuda_set_mul_mat_q(bool mul_mat_q); -void ggml_cuda_set_scratch_size(size_t scratch_size); -void ggml_cuda_free_scratch(void); -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); +GGML_API void ggml_init_cublas(void); +GGML_API void * ggml_cuda_host_malloc(size_t size); +GGML_API void ggml_cuda_host_free(void * ptr); + +GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split); +GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); +GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor); + +GGML_API void 
ggml_cuda_assign_buffers(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); + +GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset); + +GGML_API void ggml_cuda_set_main_device(int main_device); +GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q); +GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size); +GGML_API void ggml_cuda_free_scratch(void); +GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); + +GGML_API int ggml_cuda_get_device_count(void); +GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size); #ifdef __cplusplus } diff --git a/ggml-metal.h b/ggml-metal.h index 16f1a0caacfac483cf2c33e0715ca8d1c61ea7ce..fca28d37ef97069ca7800fc890ea4d013c46437f 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -24,6 +24,7 @@ // max memory buffers that can be mapped to the device #define GGML_METAL_MAX_BUFFERS 16 +#define GGML_METAL_MAX_COMMAND_BUFFERS 32 struct ggml_tensor; struct ggml_cgraph; @@ -38,6 +39,9 @@ struct ggml_metal_context; struct ggml_metal_context * ggml_metal_init(int n_cb); void ggml_metal_free(struct ggml_metal_context * ctx); +void * ggml_metal_host_malloc(size_t n); +void ggml_metal_host_free (void * data); + // set the number of command buffers to use void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); @@ -63,10 +67,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * // try to find operations that can be run concurrently in the graph // you should run it again if the topology of your graph changes -void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); +void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem); + +// if the graph has been optimized for concurrent dispatch, return the length of the concur_list if optimized +int ggml_metal_if_optimized(struct ggml_metal_context * ctx); -// if the graph has been optimized for concurrently dispatch -bool ggml_metal_if_optimized(struct ggml_metal_context * ctx); +// output the concur_list for ggml_alloc +int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); // same as ggml_graph_compute but uses Metal // creates gf->n_threads command buffers in parallel diff --git a/ggml-metal.m b/ggml-metal.m index b47a98e214b613fb0f022f8606047e6771201ddd..917089fcbeb23a03cbb8a19dac4a7f611da2864a 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -5,13 +5,13 @@ #import <Foundation/Foundation.h> #import <Metal/Metal.h> -#import <MetalPerformanceShaders/MetalPerformanceShaders.h> #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +// TODO: temporary - reuse llama.cpp logging #ifdef GGML_METAL_NDEBUG #define metal_printf(...)
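// Editor's note (not part of the patch): with GGML_METAL_NDEBUG defined, the
// variadic macro above expands to nothing, so every metal_printf(...) call in
// this file compiles away entirely; the #else branch below is expected to
// forward to fprintf(stderr, ...). A minimal standalone equivalent of the
// pattern, under that assumption:
//
//     #ifdef NDEBUG
//     #define log_debug(...)
//     #else
//     #define log_debug(...) fprintf(stderr, __VA_ARGS__)
//     #endif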
#else @@ -34,12 +34,15 @@ struct ggml_metal_buffer { struct ggml_metal_context { int n_cb; - float * logits; - id device; id queue; id library; + id command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS]; + id command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS]; + + dispatch_queue_t d_queue; + int n_buffers; struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; @@ -60,10 +63,14 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(relu); GGML_METAL_DECL_KERNEL(gelu); GGML_METAL_DECL_KERNEL(soft_max); + GGML_METAL_DECL_KERNEL(soft_max_4); GGML_METAL_DECL_KERNEL(diag_mask_inf); + GGML_METAL_DECL_KERNEL(diag_mask_inf_8); + GGML_METAL_DECL_KERNEL(get_rows_f32); GGML_METAL_DECL_KERNEL(get_rows_f16); GGML_METAL_DECL_KERNEL(get_rows_q4_0); GGML_METAL_DECL_KERNEL(get_rows_q4_1); + GGML_METAL_DECL_KERNEL(get_rows_q8_0); GGML_METAL_DECL_KERNEL(get_rows_q2_K); GGML_METAL_DECL_KERNEL(get_rows_q3_K); GGML_METAL_DECL_KERNEL(get_rows_q4_K); @@ -71,14 +78,28 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(get_rows_q6_K); GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(norm); + GGML_METAL_DECL_KERNEL(mul_mat_f32_f32); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); + GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row); + GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4); GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_f32_f32); + GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); GGML_METAL_DECL_KERNEL(rope); GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(cpy_f32_f16); @@ -100,33 +121,52 @@ static NSString * const msl_library_source = @"see metal.metal"; @end struct ggml_metal_context * ggml_metal_init(int n_cb) { - fprintf(stderr, "%s: allocating\n", __func__); + metal_printf("%s: allocating\n", __func__); - struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + id device; + NSString * s; + +#if TARGET_OS_OSX + // Show all the Metal device instances in the system + NSArray * devices = MTLCopyAllDevices(); + for (device in devices) { + s = [device name]; + metal_printf("%s: found device: %s\n", __func__, [s UTF8String]); + } +#endif + + // Pick and show default Metal device + device = MTLCreateSystemDefaultDevice(); + s = [device name]; + metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]); - ctx->n_cb = n_cb; - ctx->device = MTLCreateSystemDefaultDevice(); + // Configure context + struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + ctx->device = device; + ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; ctx->concur_list_len = 0; - // determine if we can use MPS - if (MPSSupportsMTLDevice(ctx->device)) { - fprintf(stderr, "%s: using MPS\n", __func__); - } else { - fprintf(stderr, "%s: not using MPS\n", __func__); - GGML_ASSERT(false && "MPS not supported"); - } + ctx->d_queue = dispatch_queue_create("ggml-metal", 
DISPATCH_QUEUE_CONCURRENT); -#if 0 - // compile from source string and show compile log +#ifdef GGML_SWIFT + // load the default.metallib file { NSError * error = nil; - ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; + NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; + NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"]; + NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath]; + NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"]; + NSURL * libURL = [NSURL fileURLWithPath:libPath]; + + // Load the metallib file into a Metal library + ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; + if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); - exit(1); + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; } } #else @@ -138,13 +178,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; - NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; - fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]); + NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; + metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); - exit(1); + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; } #ifdef GGML_QKK_64 @@ -155,18 +195,25 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; #endif if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); - exit(1); + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); + return NULL; } } #endif // load kernels { + NSError * error = nil; #define GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ - ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \ - fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); + ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ + metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ + (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ + (int) ctx->pipeline_##name.threadExecutionWidth); \ + if (error) { \ + metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + return NULL; \ + } GGML_METAL_ADD_KERNEL(add); GGML_METAL_ADD_KERNEL(add_row); @@ -177,10 +224,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(relu); GGML_METAL_ADD_KERNEL(gelu); GGML_METAL_ADD_KERNEL(soft_max); + GGML_METAL_ADD_KERNEL(soft_max_4); GGML_METAL_ADD_KERNEL(diag_mask_inf); + GGML_METAL_ADD_KERNEL(diag_mask_inf_8); + GGML_METAL_ADD_KERNEL(get_rows_f32); GGML_METAL_ADD_KERNEL(get_rows_f16); GGML_METAL_ADD_KERNEL(get_rows_q4_0); 
GGML_METAL_ADD_KERNEL(get_rows_q4_1); + GGML_METAL_ADD_KERNEL(get_rows_q8_0); GGML_METAL_ADD_KERNEL(get_rows_q2_K); GGML_METAL_ADD_KERNEL(get_rows_q3_K); GGML_METAL_ADD_KERNEL(get_rows_q4_K); @@ -188,14 +239,28 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(get_rows_q6_K); GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(norm); + GGML_METAL_ADD_KERNEL(mul_mat_f32_f32); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); + GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row); + GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); + GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); GGML_METAL_ADD_KERNEL(rope); GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(cpy_f32_f16); @@ -205,34 +270,117 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #undef GGML_METAL_ADD_KERNEL } - fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); +#if TARGET_OS_OSX + metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.maxTransferRate != 0) { - fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); } else { - fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__); + metal_printf("%s: maxTransferRate = built-in GPU\n", __func__); } +#endif return ctx; } void ggml_metal_free(struct ggml_metal_context * ctx) { - fprintf(stderr, "%s: deallocating\n", __func__); + metal_printf("%s: deallocating\n", __func__); +#define GGML_METAL_DEL_KERNEL(name) \ + [ctx->function_##name release]; \ + [ctx->pipeline_##name release]; + + GGML_METAL_DEL_KERNEL(add); + GGML_METAL_DEL_KERNEL(add_row); + GGML_METAL_DEL_KERNEL(mul); + GGML_METAL_DEL_KERNEL(mul_row); + GGML_METAL_DEL_KERNEL(scale); + GGML_METAL_DEL_KERNEL(silu); + GGML_METAL_DEL_KERNEL(relu); + GGML_METAL_DEL_KERNEL(gelu); + GGML_METAL_DEL_KERNEL(soft_max); + GGML_METAL_DEL_KERNEL(soft_max_4); + GGML_METAL_DEL_KERNEL(diag_mask_inf); + GGML_METAL_DEL_KERNEL(diag_mask_inf_8); + GGML_METAL_DEL_KERNEL(get_rows_f32); + GGML_METAL_DEL_KERNEL(get_rows_f16); + GGML_METAL_DEL_KERNEL(get_rows_q4_0); + GGML_METAL_DEL_KERNEL(get_rows_q4_1); + GGML_METAL_DEL_KERNEL(get_rows_q8_0); + GGML_METAL_DEL_KERNEL(get_rows_q2_K); + GGML_METAL_DEL_KERNEL(get_rows_q3_K); + GGML_METAL_DEL_KERNEL(get_rows_q4_K); + GGML_METAL_DEL_KERNEL(get_rows_q5_K); + GGML_METAL_DEL_KERNEL(get_rows_q6_K); + GGML_METAL_DEL_KERNEL(rms_norm); + GGML_METAL_DEL_KERNEL(norm); + GGML_METAL_DEL_KERNEL(mul_mat_f32_f32); + GGML_METAL_DEL_KERNEL(mul_mat_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row); + GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4); + GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); + GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_DEL_KERNEL(rope); + GGML_METAL_DEL_KERNEL(alibi_f32); + GGML_METAL_DEL_KERNEL(cpy_f32_f16); + GGML_METAL_DEL_KERNEL(cpy_f32_f32); + GGML_METAL_DEL_KERNEL(cpy_f16_f16); + +#undef GGML_METAL_DEL_KERNEL + for (int i = 0; i < ctx->n_buffers; ++i) { [ctx->buffers[i].metal release]; } + + [ctx->library release]; + [ctx->queue release]; + [ctx->device release]; + + dispatch_release(ctx->d_queue); + free(ctx); } +void * ggml_metal_host_malloc(size_t n) { + void * data = NULL; + const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + if (result != 0) { + metal_printf("%s: error: posix_memalign failed\n", __func__); + return NULL; + } + + return data; +} + +void ggml_metal_host_free(void * data) { + free(data); +} + void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { - ctx->n_cb = n_cb; + ctx->n_cb = MIN(n_cb, 
GGML_METAL_MAX_BUFFERS); } -bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) { - if (ctx->concur_list_len) { - return true; - } - return false; +int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { + return ctx->concur_list_len; +} + +int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { + return ctx->concur_list; } // finds the Metal buffer that contains the tensor data on the GPU device @@ -240,7 +388,7 @@ bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) { // Metal buffer based on the host memory pointer // static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { - //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -248,16 +396,17 @@ static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru for (int i = 0; i < ctx->n_buffers; ++i) { const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data; + //metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name); if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; - //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); return ctx->buffers[i].metal; } } - fprintf(stderr, "%s: error: buffer is nil\n", __func__); + metal_printf("%s: error: buffer is nil\n", __func__); return nil; } @@ -269,7 +418,7 @@ bool ggml_metal_add_buffer( size_t size, size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { - fprintf(stderr, "%s: too many buffers\n", __func__); + metal_printf("%s: too many buffers\n", __func__); return false; } @@ -279,12 +428,12 @@ bool ggml_metal_add_buffer( const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); return false; } } - const size_t size_page = getpagesize(); + const size_t size_page = sysconf(_SC_PAGESIZE); size_t size_aligned = size; if ((size_aligned % size_page) != 0) { @@ -300,11 +449,11 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); return false; } - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -324,28 +473,32 @@ bool ggml_metal_add_buffer( 
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); return false; } - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { - fprintf(stderr, "\n"); + metal_printf("\n"); } ++ctx->n_buffers; } } - fprintf(stderr, ", (%8.2f / %8.2f)", +#if TARGET_OS_OSX + metal_printf(", (%8.2f / %8.2f)", ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n"); + metal_printf(", warning: current allocated size is greater than the recommended max working set size\n"); } else { - fprintf(stderr, "\n"); + metal_printf("\n"); } +#else + metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); +#endif } return true; @@ -354,8 +507,6 @@ bool ggml_metal_add_buffer( void ggml_metal_set_tensor( struct ggml_metal_context * ctx, struct ggml_tensor * t) { - metal_printf("%s: set input for tensor '%s'\n", __func__, t->name); - size_t offs; id id_dst = ggml_metal_get_buffer(ctx, t, &offs); @@ -365,8 +516,6 @@ void ggml_metal_set_tensor( void ggml_metal_get_tensor( struct ggml_metal_context * ctx, struct ggml_tensor * t) { - metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name); - size_t offs; id id_src = ggml_metal_get_buffer(ctx, t, &offs); @@ -375,7 +524,7 @@ void ggml_metal_get_tensor( void ggml_metal_graph_find_concurrency( struct ggml_metal_context * ctx, - struct ggml_cgraph * gf) { + struct ggml_cgraph * gf, bool check_mem) { int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time int nodes_unused[GGML_MAX_CONCUR]; @@ -422,7 +571,7 @@ void ggml_metal_graph_find_concurrency( } } } - if (exe_flag) { + if (exe_flag && check_mem) { // check if nodes[i]'s data will be overwritten by a node before nodes[i]. 
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3] int64_t data_start = (int64_t) gf->nodes[i]->data; @@ -461,14 +610,14 @@ void ggml_metal_graph_find_concurrency( } if (ctx->concur_list_len > GGML_MAX_CONCUR) { - fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__); + metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__); } } void ggml_metal_graph_compute( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { - metal_printf("%s: evaluating graph\n", __func__); + @autoreleasepool { // if there is ctx->concur_list, dispatch concurrently // else fallback to serial dispatch @@ -484,46 +633,38 @@ void ggml_metal_graph_compute( const int n_cb = ctx->n_cb; - NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; - for (int i = 0; i < n_cb; ++i) { - command_buffers[i] = [ctx->queue commandBuffer]; + ctx->command_buffers[i] = [ctx->queue commandBuffer]; // enqueue the command buffers in order to specify their execution order - [command_buffers[i] enqueue]; - } + [ctx->command_buffers[i] enqueue]; - // TODO: is this the best way to start threads? - dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT); + ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc]; + } for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; - dispatch_async(queue, ^{ + dispatch_async(ctx->d_queue, ^{ size_t offs_src0 = 0; size_t offs_src1 = 0; size_t offs_dst = 0; - id command_buffer = command_buffers[cb_idx]; + id command_buffer = ctx->command_buffers[cb_idx]; + id encoder = ctx->command_encoders[cb_idx]; - id encoder = nil; - - const int node_start = (cb_idx + 0) * n_nodes_per_cb; - const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb; + const int node_start = (cb_idx + 0) * n_nodes_per_cb; + const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes); for (int ind = node_start; ind < node_end; ++ind) { const int i = has_concur ? 
ctx->concur_list[ind] : ind; if (i == -1) { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - continue; - } [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; continue; } - metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src1 = gf->nodes[i]->src[1]; @@ -592,12 +733,16 @@ void ggml_metal_graph_compute( } break; case GGML_OP_ADD: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + // utilize float4 + GGML_ASSERT(ne00 % 4 == 0); + const int64_t nb = ne00/4; if (ggml_nelements(src1) == ne10) { // src1 is a row + GGML_ASSERT(ne11 == 1); [encoder setComputePipelineState:ctx->pipeline_add_row]; } else { [encoder setComputePipelineState:ctx->pipeline_add]; @@ -605,20 +750,24 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_MUL: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + // utilize float4 + GGML_ASSERT(ne00 % 4 == 0); + const int64_t nb = ne00/4; if (ggml_nelements(src1) == ne10) { // src1 is a row + GGML_ASSERT(ne11 == 1); [encoder setComputePipelineState:ctx->pipeline_mul_row]; } else { [encoder setComputePipelineState:ctx->pipeline_mul]; @@ -626,17 +775,15 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_SCALE: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } + GGML_ASSERT(ggml_is_contiguous(src0)); const float scale = *(const float *) src1->data; @@ -645,7 +792,7 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; @@ -653,24 +800,16 @@ void ggml_metal_graph_compute( switch (ggml_get_unary_op(gf->nodes[i])) { case GGML_UNARY_OP_SILU: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - [encoder setComputePipelineState:ctx->pipeline_silu]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const 
int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_UNARY_OP_RELU: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - [encoder setComputePipelineState:ctx->pipeline_relu]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -681,58 +820,58 @@ void ggml_metal_graph_compute( } break; case GGML_UNARY_OP_GELU: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - [encoder setComputePipelineState:ctx->pipeline_gelu]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; default: { - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } break; case GGML_OP_SOFT_MAX: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - const int nth = 32; - [encoder setComputePipelineState:ctx->pipeline_soft_max]; + if (ne00%4 == 0) { + [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; + } else { + [encoder setComputePipelineState:ctx->pipeline_soft_max]; + } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_DIAG_MASK_INF: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - const int n_past = ((int32_t *)(dst->op_params))[0]; - [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; + if (ne00%8 == 0) { + [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8]; + } else { + [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; + } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; - [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + if (ne00%8 == 0) { + [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } + else { + [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } } break; case GGML_OP_MUL_MAT: { @@ -740,63 +879,71 @@ void ggml_metal_graph_compute( GGML_ASSERT(ne00 == ne10); // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere + uint gqa = ne12/ne02; GGML_ASSERT(ne03 == ne13); - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 
> 1) { - - if (encoder != nil) { - [encoder endEncoding]; - encoder = nil; - } - - MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; - MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; - - // for F32 x F32 we use MPS - MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor - matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt]; - - MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor - matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt]; - - MPSMatrixDescriptor * desc = [MPSMatrixDescriptor - matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32]; - - MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] - initWithDevice:ctx->device transposeLeft:false transposeRight:true - resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0]; - - // we need to do ne12 multiplications - // TODO: is there a way to do this in parallel - currently very slow .. - // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS - for (int64_t i02 = 0; i02 < ne12; ++i02) { - size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now - size_t offs_src1_cur = offs_src1 + i02*nb12; - size_t offs_dst_cur = offs_dst + i02*nb2; - - MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0]; - MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1]; - MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst_cur descriptor:desc ]; - - [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel + if (!ggml_is_transposed(src0) && + !ggml_is_transposed(src1) && + src1t == GGML_TYPE_F32 && + [ctx->device supportsFamily:MTLGPUFamilyApple7] && + ne00%32 == 0 && + ne11 > 1) { + switch (src0->type) { + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; + case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break; + case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break; + case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break; + case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break; + case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break; + case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break; + case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break; + case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break; + default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5]; + [encoder setBytes:&nb02 
length:sizeof(nb02) atIndex:6]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; + [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13]; + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - int nth0 = 32; int nth1 = 1; + int nrows = 1; // use custom matrix x vector kernel switch (src0t) { + case GGML_TYPE_F32: + { + [encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32]; + nrows = 4; + } break; case GGML_TYPE_F16: { - nth0 = 64; + nth0 = 32; nth1 = 1; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; + if (ne11 * ne12 < 4) { + [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row]; + } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { + [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4]; + nrows = ne11; + } else { + [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; + nrows = 4; + } } break; case GGML_TYPE_Q4_0: { @@ -816,6 +963,15 @@ void ggml_metal_graph_compute( nth1 = 8; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32]; } break; + case GGML_TYPE_Q8_0: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32]; + } break; case GGML_TYPE_Q2_K: { GGML_ASSERT(ne02 == 1); @@ -839,8 +995,8 @@ void ggml_metal_graph_compute( GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne12 == 1); - nth0 = 2; - nth1 = 32; + nth0 = 4; //1; + nth1 = 8; //32; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32]; } break; case GGML_TYPE_Q5_K: @@ -863,7 +1019,7 @@ void ggml_metal_graph_compute( } break; default: { - fprintf(stderr, "Asserting on type %d\n",(int)src0t); + metal_printf("Asserting on type %d\n",(int)src0t); GGML_ASSERT(false && "not implemented"); } }; @@ -885,39 +1041,41 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; + [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || - src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 || + src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q3_K) { #ifdef GGML_QKK_64 - [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; #else - [encoder 
dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; #endif } else if (src0t == GGML_TYPE_Q5_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q6_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { - [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + int64_t ny = (ne11 + nrows - 1)/nrows; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } } } break; case GGML_OP_GET_ROWS: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - switch (src0->type) { - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_get_rows_f32]; break; + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break; + case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break; case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break; case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break; case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break; @@ -929,9 +1087,9 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4]; - [encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5]; const int64_t n = ggml_nelements(src1); @@ -939,10 +1097,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_RMS_NORM: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -962,20 +1116,17 @@ void ggml_metal_graph_compute( } break; case GGML_OP_NORM: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - - const float eps = 1e-5f; + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); const int nth = 256; [encoder setComputePipelineState:ctx->pipeline_norm]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder 
setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; - [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; const int64_t nrows = ggml_nrows(src0); @@ -984,10 +1135,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_ALIBI: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - GGML_ASSERT((src0t == GGML_TYPE_F32)); const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); @@ -1022,15 +1169,13 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; [encoder setBytes:&m0 length:sizeof( float) atIndex:18]; + const int nth = 32; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ROPE: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; @@ -1041,8 +1186,8 @@ void ggml_metal_graph_compute( memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); [encoder setComputePipelineState:ctx->pipeline_rope]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; @@ -1065,16 +1210,12 @@ void ggml_metal_graph_compute( [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; } break; case GGML_OP_DUP: case GGML_OP_CPY: case GGML_OP_CONT: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - const int nth = 32; switch (src0t) { @@ -1097,30 +1238,30 @@ void ggml_metal_graph_compute( default: GGML_ASSERT(false && "not implemented"); } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - 
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; default: { - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } @@ -1136,17 +1277,34 @@ void ggml_metal_graph_compute( } // wait for all threads to finish - dispatch_barrier_sync(queue, ^{}); - - [command_buffers[n_cb - 1] waitUntilCompleted]; + dispatch_barrier_sync(ctx->d_queue, ^{}); // check status of command buffers // needed to detect if the device ran out-of-memory for example (#1881) for (int i = 0; i < n_cb; i++) { - MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status]; + [ctx->command_buffers[i] waitUntilCompleted]; + + MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; if (status != MTLCommandBufferStatusCompleted) { - fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status); + if (status == MTLCommandBufferStatusError) { + // check the Metal error code: -[MTLCommandBuffer error] returns an NSError *, whose code holds the MTLCommandBufferError value + NSError * error = [ctx->command_buffers[i] error]; + int mtl_error_code = (int) [error code]; + if ([[error domain] isEqualToString:MTLCommandBufferErrorDomain] && mtl_error_code == MTLCommandBufferErrorOutOfMemory) { + metal_printf("%s: command buffer %d failed with status MTLCommandBufferStatus.error (5) and error code \ MTLCommandBufferError.outOfMemory (8)\n", __func__, i);
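+ // the out-of-memory case additionally prints a user-facing hint on stdout, since it is usually fixable by the user (smaller context or model) rather than a programming error + printf("Metal ran out of memory. 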
Maybe try a smaller context size, or a smaller (more coarsely quantized) model, \ +preferably one under the recommended max working set size, or else fall back to running on CPU only.\n"); + } else { + metal_printf("%s: command buffer %d failed with status MTLCommandBufferStatus.error (5) and error code %d\n", + __func__, i, mtl_error_code); + } + } else { + metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status); + } GGML_ASSERT(false); } } + + } } diff --git a/ggml-metal.metal b/ggml-metal.metal index 8d26b5ec2dfa4f649b7a6d478f73238a0c700da6..7f1c3d9ea74bd031ae831bfe916ab8fa30d211a9 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -18,51 +18,16 @@ typedef struct { uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; -static void dequantize_row_q4_0(device const block_q4_0 * x, device float * y, int k) { - const int qk = QK4_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const half d = x[i].d; - - for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0x0F) - 8; - const int x1 = (x[i].qs[j] >> 4) - 8; - - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; - } - } -} - -static void dequantize_row_q4_1(device const block_q4_1 * x, device float * y, int k) { - const int qk = QK4_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const half d = x[i].d; - const half m = x[i].m; - - for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0x0F); - const int x1 = (x[i].qs[j] >> 4); - - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; - } - } -} +#define QK8_0 32 +typedef struct { + half d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; kernel void kernel_add( - device const float * src0, - device const float * src1, - device float * dst, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] + src1[tpig]; } @@ -70,18 +35,18 @@ kernel void kernel_add( // assumption: src1 is a row // broadcast src1 into src0 kernel void kernel_add_row( - device const float * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant int64_t & nb, uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] + src1[tpig % ne00]; + dst[tpig] = src0[tpig] + src1[tpig % nb]; } kernel void kernel_mul( - device const float * src0, - device const float * src1, - device float * dst, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * src1[tpig]; } @@ -89,27 +54,27 @@ kernel void kernel_mul( // assumption: src1 is a row // broadcast src1 into src0 kernel void kernel_mul_row( - device const float * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant int64_t & nb, uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * src1[tpig % ne00]; + dst[tpig] = src0[tpig] * src1[tpig % nb]; } kernel void kernel_scale( - device const float * src0, - device float * dst, + device const float4 * src0, + device float4 * dst, constant float & scale, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * scale; } kernel void kernel_silu( - device const float * src0, - device float * dst, + device const float4 * src0, + device float4 
* dst, uint tpig[[thread_position_in_grid]]) { - float x = src0[tpig]; + device const float4 & x = src0[tpig]; dst[tpig] = x / (1.0f + exp(-x)); } @@ -124,11 +89,16 @@ constant float GELU_COEF_A = 0.044715f; constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; kernel void kernel_gelu( - device const float * src0, - device float * dst, + device const float4 * src0, + device float4 * dst, uint tpig[[thread_position_in_grid]]) { - float x = src0[tpig]; - dst[tpig] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); + device const float4 & x = src0[tpig]; + + // BEWARE !!! + // Simply using "tanh" instead of "precise::tanh" will sometimes result in NaNs! + // This was observed with Falcon 7B and 40B models + // + dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); } kernel void kernel_soft_max( @@ -137,7 +107,6 @@ kernel void kernel_soft_max( constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, - threadgroup float * buf [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { @@ -149,55 +118,67 @@ kernel void kernel_soft_max( device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; // parallel max - buf[tpitg[0]] = -INFINITY; - for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { - buf[tpitg[0]] = MAX(buf[tpitg[0]], psrc0[i00]); + float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY; + for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) { + lmax = MAX(lmax, psrc0[i00]); } + const float max = simd_max(lmax); - // reduce - threadgroup_barrier(mem_flags::mem_threadgroup); - for (uint i = ntg[0]/2; i > 0; i /= 2) { - if (tpitg[0] < i) { - buf[tpitg[0]] = MAX(buf[tpitg[0]], buf[tpitg[0] + i]); - } - threadgroup_barrier(mem_flags::mem_threadgroup); + // parallel sum + float lsum = 0.0f; + for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + const float exp_psrc0 = exp(psrc0[i00] - max); + lsum += exp_psrc0; + // Remember the result of exp here. exp is expensive, so we really do not + // wish to compute it twice. + pdst[i00] = exp_psrc0; } - // broadcast - if (tpitg[0] == 0) { - buf[0] = buf[0]; + const float sum = simd_sum(lsum); + + for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { + pdst[i00] /= sum; } +}
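- threadgroup_barrier(mem_flags::mem_threadgroup); +// kernel_soft_max_4 is a float4 variant of kernel_soft_max: each thread handles four values per iteration and the final max/sum are reduced across the simdgroup with simd_max/simd_sum, so no threadgroup memory or barriers are needed; the host dispatches it only when ne00 is a multiple of 4 +kernel void kernel_soft_max_4( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; - const float max = buf[0]; + device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); - // parallel sum - buf[tpitg[0]] = 0.0f; - for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { - buf[tpitg[0]] += exp(psrc0[i00] - max); + // parallel max + float4 lmax4 = tpitg[0] < ne00/4 ?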
psrc4[tpitg[0]] : -INFINITY; + for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) { + lmax4 = fmax(lmax4, psrc4[i00]); } + float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); - // reduce - threadgroup_barrier(mem_flags::mem_threadgroup); - for (uint i = ntg[0]/2; i > 0; i /= 2) { - if (tpitg[0] < i) { - buf[tpitg[0]] += buf[tpitg[0] + i]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - } + const float max = simd_max(lmax); - // broadcast - if (tpitg[0] == 0) { - buf[0] = buf[0]; + // parallel sum + float4 lsum4 = 0.0f; + for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { + const float4 exp_psrc4 = exp(psrc4[i00] - max); + lsum4 += exp_psrc4; + pdst4[i00] = exp_psrc4; } + float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; - threadgroup_barrier(mem_flags::mem_threadgroup); - - const float sum = buf[0]; + const float sum = simd_sum(lsum); - for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { - pdst[i00] = exp(psrc0[i00] - max) / sum; + for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) { + pdst4[i00] /= sum; } } @@ -216,57 +197,36 @@ kernel void kernel_diag_mask_inf( dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY; } else { dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00]; - } + } } -kernel void kernel_get_rows_f16( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; +kernel void kernel_diag_mask_inf_8( + device const float4 * src0, + device float4 * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int & n_past, + uint3 tpig[[thread_position_in_grid]]) { - for (int j = 0; j < ne00; j++) { - dst[i*nb1 + j] = ((device half *) ((device char *) src0 + r*nb01))[j]; + const int64_t i = 2*tpig[0]; + + dst[i+0] = src0[i+0]; + dst[i+1] = src0[i+1]; + int64_t i4 = 4*i; + const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01; + const int64_t i01 = i4/(ne00); i4 -= i01*ne00; + const int64_t i00 = i4; + for (int k = 3; k >= 0; --k) { + if (i00 + 4 + k <= n_past + i01) { + break; + } + dst[i+1][k] = -INFINITY; + if (i00 + k > n_past + i01) { + dst[i][k] = -INFINITY; + } } } -kernel void kernel_get_rows_q4_0( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; - - dequantize_row_q4_0( - (device const block_q4_0 *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} - -kernel void kernel_get_rows_q4_1( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; - - dequantize_row_q4_1( - (device const block_q4_1 *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} - kernel void kernel_norm( device const void * src0, device float * dst, @@ -292,25 +252,17 @@ kernel void kernel_norm( } threadgroup_barrier(mem_flags::mem_threadgroup); } - // broadcast - if (tpitg == 0) { - sum[0] /= ne00; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - const float mean = sum[0]; + const float mean = sum[0] 
/ ne00; - // recenter + // recenter and VARIANCE + threadgroup_barrier(mem_flags::mem_threadgroup); device float * y = dst + tgpig*ne00; - for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - y[i00] = x[i00] - mean; - } - - // VARIANCE - // parallel sum sum[tpitg] = 0.0f; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + y[i00] = x[i00] - mean; sum[tpitg] += y[i00] * y[i00]; } + // reduce threadgroup_barrier(mem_flags::mem_threadgroup); for (uint i = ntg/2; i > 0; i /= 2) { @@ -319,12 +271,7 @@ kernel void kernel_norm( } threadgroup_barrier(mem_flags::mem_threadgroup); } - // broadcast - if (tpitg == 0) { - sum[0] /= ne00; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - const float variance = sum[0]; + const float variance = sum[0] / ne00; const float scale = 1.0f/sqrt(variance + eps); for (int i00 = tpitg; i00 < ne00; i00 += ntg) { @@ -332,7 +279,6 @@ kernel void kernel_norm( } } - kernel void kernel_rms_norm( device const void * src0, device float * dst, @@ -432,14 +378,16 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre // N_DST, so this is another explicit assumption of the implementation. template void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst, - int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01, - uint2 tgpig, uint tiisg, uint sgitg) { + int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa, + uint3 tgpig, uint tiisg, uint sgitg) { const int nb = ne00/QK4_0; const int r0 = tgpig.x; const int r1 = tgpig.y; + const int im = tgpig.z; const int first_row = (r0 * nsg + sgitg) * nr; - device const block_q_type * x = (device const block_q_type *) src0 + first_row * nb; - device const float * y = (device const float *) src1 + r1*ne10; + const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + device const block_q_type * x = (device const block_q_type *) src0 + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; float yl[16]; // src1 vector cache float sumf[nr]={0.f}; @@ -470,7 +418,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device for (int row = 0; row < nr; ++row) { const float tot = simd_sum(sumf[row]); if (tiisg == 0 && first_row + row < ne01) { - dst[r1*ne0 + first_row + row] = tot; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; } } } @@ -480,13 +428,17 @@ kernel void kernel_mul_mat_q4_0_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, constant int64_t & ne01[[buffer(4)]], - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); } kernel void kernel_mul_mat_q4_1_f32( @@ -494,16 +446,86 @@ kernel void kernel_mul_mat_q4_1_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, constant int64_t & ne01[[buffer(4)]], - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne02[[buffer(5)]], + 
constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); } -kernel void kernel_mul_mat_f16_f32( +#define NB_Q8_0 8 + +kernel void kernel_mul_mat_q8_0_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + const int nr = N_DST; + const int nsg = N_SIMDGROUP; + const int nw = N_SIMDWIDTH; + + const int nb = ne00/QK8_0; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + const int first_row = (r0 * nsg + sgitg) * nr; + const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[NB_Q8_0]; + float sumf[nr]={0.f}; + + const int ix = tiisg/4; + const int il = tiisg%4; + + device const float * yb = y + ix * QK8_0 + NB_Q8_0*il; + + // each thread in a SIMD group deals with NB_Q8_0 quants at a time + for (int ib = ix; ib < nb; ib += nw/4) { + for (int i = 0; i < NB_Q8_0; ++i) { + yl[i] = yb[i]; + } + + for (int row = 0; row < nr; row++) { + device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il; + float sumq = 0.f; + for (int iq = 0; iq < NB_Q8_0; ++iq) { + sumq += qs[iq] * yl[iq]; + } + sumf[row] += sumq*x[ib+row*nb].d; + } + + yb += NB_Q8_0 * nw; + } + + for (int row = 0; row < nr; ++row) { + const float tot = simd_sum(sumf[row]); + if (tiisg == 0 && first_row + row < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; + } + } +} + +#define N_F32_F32 4 + +kernel void kernel_mul_mat_f32_f32( device const char * src0, device const char * src1, device float * dst, @@ -521,11 +543,79 @@ kernel void kernel_mul_mat_f16_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - threadgroup float * sum [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint3 tpig[[thread_position_in_grid]], - uint3 tpitg[[thread_position_in_threadgroup]], - uint3 tptg[[threads_per_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]]) { + + const int64_t r0 = tgpig.x; + const int64_t rb = tgpig.y*N_F32_F32; + const int64_t im = tgpig.z; + + device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + + if (ne00 < 128) { + for (int row = 0; row < N_F32_F32; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + for (int i = tiisg; i < ne00; i += 32) { + sumf += (float) x[i] * (float) y[i]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } else { + device const float4 * x4 = (device const 
float4 *)x; + for (int row = 0; row < N_F32_F32; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + device const float4 * y4 = (device const float4 *) y; + + float sumf = 0; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i]; + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } + } +} + +kernel void kernel_mul_mat_f16_f32_1row( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; @@ -534,26 +624,145 @@ kernel void kernel_mul_mat_f16_f32( device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); - sum[tpitg.x] = 0.0f; - - for (int i = tpitg.x; i < ne00; i += tptg.x) { - sum[tpitg.x] += (float) x[i] * (float) y[i]; + float sumf = 0; + if (ne00 < 128) { + for (int i = tiisg; i < ne00; i += 32) { + sumf += (float) x[i] * (float) y[i]; + } + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } else { + device const half4 * x4 = (device const half4 *) x; + device const float4 * y4 = (device const float4 *) y; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k]; + } + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i]; + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } } - // accumulate the sum from all threads in the threadgroup - threadgroup_barrier(mem_flags::mem_threadgroup); - for (uint i = tptg.x/2; i > 0; i /= 2) { - if (tpitg.x < i) { - sum[tpitg.x] += sum[tpitg.x + i]; +} + +#define N_F16_F32 4 + +kernel void kernel_mul_mat_f16_f32( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int64_t r0 = tgpig.x; + const int64_t rb = tgpig.y*N_F16_F32; + const int64_t im = tgpig.z; + + device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + + if (ne00 < 128) { + for (int row = 0; row < N_F16_F32; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } + + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + for (int i = tiisg; i < ne00; i += 32) { + sumf += (float) x[i] * (float) y[i]; + } + + float all_sum = 
simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } } - threadgroup_barrier(mem_flags::mem_threadgroup); - } + } else { + device const half4 * x4 = (device const half4 *)x; + for (int row = 0; row < N_F16_F32; ++row) { + int r1 = rb + row; + if (r1 >= ne11) { + break; + } - if (tpitg.x == 0) { - dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0]; + device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); + device const float4 * y4 = (device const float4 *) y; + + float sumf = 0; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i]; + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } } } +// Assumes row size (ne00) is a multiple of 4 +kernel void kernel_mul_mat_f16_f32_l4( + device const char * src0, + device const char * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]]) { + + const int nrows = ne11; + const int64_t r0 = tgpig.x; + const int64_t im = tgpig.z; + + device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); + + for (int r1 = 0; r1 < nrows; ++r1) { + device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12); + + float sumf = 0; + for (int i = tiisg; i < ne00/4; i += 32) { + for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k]; + } + + float all_sum = simd_sum(sumf); + if (tiisg == 0) { + dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum; + } + } +} kernel void kernel_alibi_f32( device const float * src0, @@ -621,25 +830,27 @@ kernel void kernel_rope( constant int & mode, constant float & freq_base, constant float & freq_scale, - uint3 tpig[[thread_position_in_grid]]) { - const int64_t i3 = tpig[2]; - const int64_t i2 = tpig[1]; - const int64_t i1 = tpig[0]; + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg[[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int64_t i3 = tgpig[2]; + const int64_t i2 = tgpig[1]; + const int64_t i1 = tgpig[0]; const bool is_neox = mode & 2; - const float theta_scale = pow(freq_base, -2.0f/n_dims); const int64_t p = ((mode & 1) == 0 ? 
n_past + i2 : i2); - float theta = freq_scale * (float)p; + const float theta_0 = freq_scale * (float)p; + const float inv_ndims = -1.f/n_dims; if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { + + const float theta = theta_0 * pow(freq_base, inv_ndims*i0); const float cos_theta = cos(theta); const float sin_theta = sin(theta); - theta *= theta_scale; - device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -650,7 +861,25 @@ kernel void kernel_rope( dst_data[1] = x0*sin_theta + x1*cos_theta; } } else { - // TODO: implement + for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { + for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { + + const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib); + const float cos_theta = cos(theta); + const float sin_theta = sin(theta); + + const int64_t i0 = ib*n_dims + ic/2; + + device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } + } } } @@ -779,442 +1008,94 @@ kernel void kernel_cpy_f32_f32( } } -//============================================ k-quants ====================================================== - -#ifndef QK_K -#define QK_K 256 -#else -static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64"); -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -typedef struct { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - half d; // super-block scale for quantized scales - half dmin; // super-block scale for quantized mins -} block_q2_K; -// 84 bytes / block - -typedef struct { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - half d; // super-block scale -} block_q3_K; - -#if QK_K == 64 -typedef struct { - half d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -typedef struct { - half d; // super-block scale for quantized scales - half dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_K; -#endif - -#if QK_K == 64 -typedef struct { - half d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_K; -#else -typedef struct { - half d; // super-block scale for quantized scales - half dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_K; -// 176 bytes / block -#endif - -typedef struct { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - half d; // super-block scale -} block_q6_K; -// 210 
bytes / block - -static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { - uchar4 r; - if (j < 4) { - r[0] = q[j+0] & 63; - r[2] = q[j+1] & 63; - r[1] = q[j+4] & 63; - r[3] = q[j+5] & 63; - } else { - r[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); - r[2] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4); - r[1] = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); - r[3] = (q[j+5] >> 4) | ((q[j+1] >> 6) << 4); - } - return r; -} - -//========================================== dequantization ============================= - -static void dequantize_row_q2_K(device const block_q2_K * x, device float * y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; - - for (int i = 0; i < nb; i++) { - - const float d = x[i].d; - const float min = x[i].dmin; - - device const uint8_t * q = x[i].qs; - -#if QK_K == 256 - int is = 0; - float dl, ml; - for (int n = 0; n < QK_K; n += 128) { - int shift = 0; - for (int j = 0; j < 4; ++j) { - - uint8_t sc = x[i].scales[is++]; - dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; - - sc = x[i].scales[is++]; - dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; - - shift += 2; - } - q += 32; - } -#else - float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4); - float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4); - float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4); - float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4); - for (int l = 0; l < 16; ++l) { - y[l+ 0] = dl1 * ((q[l] >> 0) & 3) - ml1; - y[l+16] = dl2 * ((q[l] >> 2) & 3) - ml2; - y[l+32] = dl3 * ((q[l] >> 4) & 3) - ml3; - y[l+48] = dl4 * ((q[l] >> 6) & 3) - ml4; - } - y += QK_K; -#endif - - } -} - -static void dequantize_row_q3_K(device const block_q3_K * x, device float * y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; - -#if QK_K == 256 - - const uint16_t kmask1 = 0x0303; - const uint16_t kmask2 = 0x0f0f; - - uint16_t aux[8]; - thread const int8_t * scales = (thread const int8_t*)aux; - - for (int i = 0; i < nb; i++) { - - const float d_all = (float)(x[i].d); - - device const uint8_t * q = x[i].qs; - device const uint8_t * h = x[i].hmask; - uint8_t m = 1; - - device const uint16_t * a = (device const uint16_t *)x[i].scales; - aux[0] = (a[0] & kmask2) | (((a[4] >> 0) & kmask1) << 4); - aux[1] = (a[1] & kmask2) | (((a[5] >> 0) & kmask1) << 4); - aux[2] = (a[2] & kmask2) | (((a[4] >> 2) & kmask1) << 4); - aux[3] = (a[3] & kmask2) | (((a[5] >> 2) & kmask1) << 4); - aux[4] = ((a[0] >> 4) & kmask2) | (((a[4] >> 4) & kmask1) << 4); - aux[5] = ((a[1] >> 4) & kmask2) | (((a[5] >> 4) & kmask1) << 4); - aux[6] = ((a[2] >> 4) & kmask2) | (((a[4] >> 6) & kmask1) << 4); - aux[7] = ((a[3] >> 4) & kmask2) | (((a[5] >> 6) & kmask1) << 4); - - int is = 0; - float dl; - for (int n = 0; n < QK_K; n += 128) { - int shift = 0; - for (int j = 0; j < 4; ++j) { - - dl = d_all * (scales[is++] - 32); - for (int l = 0; l < 16; ++l) { - *y++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((h[l+ 0] & m) ? 0 : 4)); - } - - dl = d_all * (scales[is++] - 32); - for (int l = 0; l < 16; ++l) { - *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((h[l+16] & m) ? 
0 : 4)); - } - - shift += 2; - m <<= 1; - } - q += 32; - } - } -#else - for (int i = 0; i < nb; i++) { - - const float d_all = (float)(x[i].d); - - device const uint8_t * q = x[i].qs; - device const uint8_t * hm = x[i].hmask; - - const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); - const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); - const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); - const float d4 = d_all * ((x[i].scales[1] >> 4) - 8); - - for (int l = 0; l < 8; ++l) { - uint8_t h = hm[l]; - y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4)); - y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4)); - y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4)); - y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4)); - y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4)); - y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4)); - y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4)); - y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4)); - } - y += QK_K; - } -#endif - -} - -static void dequantize_row_q4_K(device const block_q4_K * x, device float * y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; - - for (int i = 0; i < nb; i++) { - - device const uint8_t * q = x[i].qs; - -#if QK_K == 256 - const float d = x[i].d; - const float min = x[i].dmin; - - device const uint8_t * scales = x[i].scales; - - int is = 0; - for (int j = 0; j < QK_K; j += 64) { - const uchar4 sc = get_scale_min_k4(is, scales); - const float d1 = d * sc[0]; const float m1 = min * sc[1]; - const float d2 = d * sc[2]; const float m2 = min * sc[3]; - for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1; - for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2; - q += 32; is += 2; - } -#else - device const uint8_t * s = x[i].scales; - device const half2 * dh = (device const half2 *)x[i].d; - const float2 d = (float2)dh[0]; - const float d1 = d[0] * (s[0] & 0xF); - const float d2 = d[0] * (s[1] & 0xF); - const float m1 = d[1] * (s[0] >> 4); - const float m2 = d[1] * (s[1] >> 4); - for (int l = 0; l < 32; ++l) { - y[l+ 0] = d1 * (q[l] & 0xF) - m1; - y[l+32] = d2 * (q[l] >> 4) - m2; - } - y += QK_K; -#endif - - } -} - -static void dequantize_row_q5_K(device const block_q5_K * x, device float * y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; - -#if QK_K == 256 - for (int i = 0; i < nb; i++) { - - const float d = (float)(x[i].d); - const float min = (float)(x[i].dmin); - - device const uint8_t * ql = x[i].qs; - device const uint8_t * qh = x[i].qh; - - int is = 0; - uint8_t u1 = 1, u2 = 2; - for (int j = 0; j < QK_K; j += 64) { - const uchar4 sc = get_scale_min_k4(is, x[i].scales); - const float d1 = d * sc[0]; const float m1 = min * sc[1]; - const float d2 = d * sc[2]; const float m2 = min * sc[3]; - for (int l = 0; l < 32; ++l) *y++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1; - for (int l = 0; l < 32; ++l) *y++ = d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2; - ql += 32; is += 2; - u1 <<= 2; u2 <<= 2; - } - } -#else - for (int i = 0; i < nb; i++) { - - const float d = (float)x[i].d; - - device const uint8_t * ql = x[i].qs; - device const uint8_t * qh = x[i].qh; - device const int8_t * sc = x[i].scales; - - for (int l = 0; l < 8; ++l) { - y[l+ 0] = d * sc[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); - y[l+ 8] = d * sc[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); - y[l+16] = d * sc[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 
0 : 16)); - y[l+24] = d * sc[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16)); - y[l+32] = d * sc[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16)); - y[l+40] = d * sc[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16)); - y[l+48] = d * sc[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16)); - y[l+56] = d * sc[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16)); - } - y += QK_K; - } -#endif - -} - -static void dequantize_row_q6_K(device const block_q6_K * x, device float * y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; - - for (int i = 0; i < nb; i++) { - - device const uint8_t * ql = x[i].ql; - device const uint8_t * qh = x[i].qh; - device const int8_t * sc = x[i].scales; - - const float d = x[i].d; - -#if QK_K == 256 - for (int n = 0; n < QK_K; n += 128) { - for (int l = 0; l < 32; ++l) { - int is = l/16; - const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - y[l + 0] = d * sc[is + 0] * q1; - y[l + 32] = d * sc[is + 2] * q2; - y[l + 64] = d * sc[is + 4] * q3; - y[l + 96] = d * sc[is + 6] * q4; - } - y += 128; - ql += 64; - qh += 32; - sc += 8; - } -#else - for (int l = 0; l < 16; ++l) { - const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - y[l+ 0] = d * sc[0] * q1; - y[l+16] = d * sc[1] * q2; - y[l+32] = d * sc[2] * q3; - y[l+48] = d * sc[3] * q4; - } - y += 64; -#endif - } -} - -kernel void kernel_get_rows_q2_K( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; - - dequantize_row_q2_K( - (device const block_q2_K *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} - -kernel void kernel_get_rows_q3_K( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; - - dequantize_row_q3_K( - (device const block_q3_K *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} - -kernel void kernel_get_rows_q4_K( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; +//============================================ k-quants ====================================================== - dequantize_row_q4_K( - (device const block_q4_K *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} +#ifndef QK_K +#define QK_K 256 +#else +static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64"); +#endif -kernel void kernel_get_rows_q5_K( - device const void * src0, - device const int * src1, - device float * dst, - 
constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif - dequantize_row_q5_K( - (device const block_q5_K *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} +typedef struct { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + half d; // super-block scale for quantized scales + half dmin; // super-block scale for quantized mins +} block_q2_K; +// 84 bytes / block -kernel void kernel_get_rows_q6_K( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + half d; // super-block scale +} block_q3_K; + +#if QK_K == 64 +typedef struct { + half d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +typedef struct { + half d; // super-block scale for quantized scales + half dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +#endif + +#if QK_K == 64 +typedef struct { + half d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +#else +typedef struct { + half d; // super-block scale for quantized scales + half dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +// 176 bytes / block +#endif + +typedef struct { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + half d; // super-block scale +} block_q6_K; +// 210 bytes / block - dequantize_row_q6_K( - (device const block_q6_K *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); +static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { + uchar4 r; + if (j < 4) { + r[0] = q[j+0] & 63; + r[2] = q[j+1] & 63; + r[1] = q[j+4] & 63; + r[3] = q[j+5] & 63; + } else { + r[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + r[2] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4); + r[1] = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + r[3] = (q[j+5] >> 4) | ((q[j+1] >> 6) << 4); + } + return r; } //====================================== dot products ========================= @@ -1224,21 +1105,27 @@ kernel void kernel_mul_mat_q2_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, constant int64_t & ne01[[buffer(4)]], - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & 
ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; + const int r2 = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int ib_row = first_row * nb; - device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row; - device const float * y = (device const float *) src1 + r1*ne10; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; float yl[32]; float sumf[N_DST]={0.f}, all_sum; @@ -1351,7 +1238,7 @@ kernel void kernel_mul_mat_q2_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + first_row + row] = all_sum; + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; } } } @@ -1362,10 +1249,14 @@ kernel void kernel_mul_mat_q3_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, - constant int64_t & ne1, - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1373,37 +1264,47 @@ kernel void kernel_mul_mat_q3_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; + const int64_t r2 = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; - device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb; - device const float * yy = (device const float *) src1 + r1*ne10; - - float yl[16]; + float yl[32]; - const uint16_t kmask1 = 0x0303; + const uint16_t kmask1 = 0x3030; const uint16_t kmask2 = 0x0f0f; - const int tid = tiisg/2; - const int ix = tiisg%2; - const int ip = tid/8; // 0 or 1 - const int il = tid/2 - 4*ip; // 0...3 + const int tid = tiisg/4; + const int ix = tiisg%4; + const int ip = tid/4; // 0 or 1 + const int il = 2*((tid%4)/2); // 0 or 2 const int ir = tid%2; const int n = 8; const int l0 = n*ir; - const uint16_t m1 = 1 << (4*ip + il); - const uint16_t m2 = m1 << 8; + // One would think that the Metal compiler would figure out that ip and il can only have + // 4 possible states, and optimize accordingly. Well, no. It needs help, and we do it + // with these two tables.
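+ // (Reading the tables: each mm entry packs the four high-bit masks one thread
+ // touches for a given (ip, il) state, i.e. mm[2*ip + il/2] = { m1, m1 << 8, 2*m1, (2*m1) << 8 }
+ // with m1 = 1 << (4*ip + il) -- the same values the removed code recomputed per thread.
+ // Likewise, qm[il/2] packs the low-2-bit masks for both bytes of a 16-bit load,
+ // components [0],[1] covering bit pair il and [2],[3] covering bit pair il+1.)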
+ // + // Possible masks for the high bit + const ushort4 mm[4] = {{0x0001, 0x0100, 0x0002, 0x0200}, // ip = 0, il = 0 + {0x0004, 0x0400, 0x0008, 0x0800}, // ip = 0, il = 2 + {0x0010, 0x1000, 0x0020, 0x2000}, // ip = 1, il = 0 + {0x0040, 0x4000, 0x0080, 0x8000}}; // ip = 1, il = 2 + + // Possible masks for the low 2 bits + const int4 qm[2] = {{0x0003, 0x0300, 0x000c, 0x0c00}, {0x0030, 0x3000, 0x00c0, 0xc000}}; + + const ushort4 hm = mm[2*ip + il/2]; const int shift = 2*il; - const uint16_t qm1 = 0x0003 << shift; - const uint16_t qm2 = 0x0300 << shift; - const int32_t v1 = 4 << shift; - const int32_t v2 = 1024 << shift; + const float v1 = il == 0 ? 4.f : 64.f; + const float v2 = 4.f * v1; const uint16_t s_shift1 = 4*ip; - const uint16_t s_shift2 = s_shift1 + 2*(il/2); - const int ik = 4 + (il%2); + const uint16_t s_shift2 = s_shift1 + il; const int q_offset = 32*ip + l0; const int y_offset = 128*ip + 32*il + l0; @@ -1412,12 +1313,19 @@ kernel void kernel_mul_mat_q3_K_f32( device const float * y1 = yy + ix*QK_K + y_offset; - float sumf1[2] = {0.f}, sumf2[2] = {0.f}; - for (int i = ix; i < nb; i += 2) { + uint32_t scales32, aux32; + thread uint16_t * scales16 = (thread uint16_t *)&scales32; + thread const int8_t * scales = (thread const int8_t *)&scales32; + + float sumf1[2] = {0.f}; + float sumf2[2] = {0.f}; + for (int i = ix; i < nb; i += 4) { for (int l = 0; l < 8; ++l) { - yl[l+0] = y1[l+ 0]; - yl[l+8] = y1[l+16]; + yl[l+ 0] = y1[l+ 0]; + yl[l+ 8] = y1[l+16]; + yl[l+16] = y1[l+32]; + yl[l+24] = y1[l+48]; } device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset); @@ -1428,27 +1336,43 @@ kernel void kernel_mul_mat_q3_K_f32( for (int row = 0; row < 2; ++row) { const float d_all = (float)dh[0]; - const char2 scales = as_type((uint16_t)(((a[il] >> s_shift1) & kmask2) | (((a[ik] >> s_shift2) & kmask1) << 4))); - float s1 = 0, s2 = 0; + scales16[0] = a[4]; + scales16[1] = a[5]; + aux32 = ((scales32 >> s_shift2) << 4) & 0x30303030; + scales16[0] = a[il+0]; + scales16[1] = a[il+1]; + scales32 = ((scales32 >> s_shift1) & 0x0f0f0f0f) | aux32; + + float s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0; for (int l = 0; l < n; l += 2) { - const uint16_t qs = q[l/2]; - s1 += yl[l+0] * ((int32_t)(qs & qm1) - ((h[l/2] & m1) ? 0 : v1)); - s2 += yl[l+1] * ((int32_t)(qs & qm2) - ((h[l/2] & m2) ? 0 : v2)); + const int32_t qs = q[l/2]; + s1 += yl[l+0] * (qs & qm[il/2][0]); + s2 += yl[l+1] * (qs & qm[il/2][1]); + s3 += ((h[l/2] & hm[0]) ? 0.f : yl[l+0]) + ((h[l/2] & hm[1]) ? 0.f : yl[l+1]); + s4 += yl[l+16] * (qs & qm[il/2][2]); + s5 += yl[l+17] * (qs & qm[il/2][3]); + s6 += ((h[l/2] & hm[2]) ? 0.f : yl[l+16]) + ((h[l/2] & hm[3]) ? 0.f : yl[l+17]); } - float d = d_all * (s1 + 1.f/256.f * s2); - sumf1[row] += d * scales[0]; - sumf2[row] += d; + float d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1); + float d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2); + sumf1[row] += d1 * (scales[0] - 32); + sumf2[row] += d2 * (scales[2] - 32); - s1 = s2 = 0; + s1 = s2 = s3 = s4 = s5 = s6 = 0; for (int l = 0; l < n; l += 2) { - const uint16_t qs = q[l/2+8]; - s1 += yl[l+8] * ((int32_t)(qs & qm1) - ((h[l/2+8] & m1) ? 0 : v1)); - s2 += yl[l+9] * ((int32_t)(qs & qm2) - ((h[l/2+8] & m2) ? 0 : v2)); + const int32_t qs = q[l/2+8]; + s1 += yl[l+8] * (qs & qm[il/2][0]); + s2 += yl[l+9] * (qs & qm[il/2][1]); + s3 += ((h[l/2+8] & hm[0]) ? 0.f : yl[l+8]) + ((h[l/2+8] & hm[1]) ? 0.f : yl[l+9]); + s4 += yl[l+24] * (qs & qm[il/2][2]); + s5 += yl[l+25] * (qs & qm[il/2][3]); + s6 += ((h[l/2+8] & hm[2]) ? 
0.f : yl[l+24]) + ((h[l/2+8] & hm[3]) ? 0.f : yl[l+25]); } - d = d_all * (s1 + 1.f/256.f * s2); - sumf1[row] += d * scales[1]; - sumf2[row] += d; + d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1); + d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2); + sumf1[row] += d1 * (scales[1] - 32); + sumf2[row] += d2 * (scales[3] - 32); q += step; h += step; @@ -1457,15 +1381,17 @@ kernel void kernel_mul_mat_q3_K_f32( } - y1 += 2 * QK_K; + y1 += 4 * QK_K; } for (int row = 0; row < 2; ++row) { - const float sumf = (sumf1[row] - 32.f*sumf2[row]) / (1 << shift); - const float tot = simd_sum(sumf); - if (tiisg == 0) { - dst[r1*ne0 + first_row + row] = tot; + const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift); + sumf1[row] = simd_sum(sumf); + } + if (tiisg == 0) { + for (int row = 0; row < 2; ++row) { + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row]; } } } @@ -1475,10 +1401,14 @@ kernel void kernel_mul_mat_q3_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, - constant int64_t & ne1, - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1486,11 +1416,12 @@ kernel void kernel_mul_mat_q3_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; + const int64_t r2 = tgpig.z; const int row = 2 * r0 + sgitg; - - device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb; - device const float * yy = (device const float *) src1 + r1*ne10; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; const int ix = tiisg/4; const int il = 4 * (tiisg%4);// 0, 4, 8, 12 const int im = il/8; // 0, 0, 1, 1 @@ -1529,7 +1460,7 @@ kernel void kernel_mul_mat_q3_K_f32( const float tot = simd_sum(sumf); if (tiisg == 0) { - dst[r1*ne0 + row] = tot; + dst[r1*ne0 + r2*ne0*ne1 + row] = tot; } } @@ -1541,10 +1472,14 @@ kernel void kernel_mul_mat_q4_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, - constant int64_t & ne01[[buffer(4)]], - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne01 [[buffer(4)]], + constant int64_t & ne02 [[buffer(5)]], + constant int64_t & ne10 [[buffer(9)]], + constant int64_t & ne12 [[buffer(11)]], + constant int64_t & ne0 [[buffer(15)]], + constant int64_t & ne1 [[buffer(16)]], + constant uint & gqa [[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1560,10 +1495,13 @@ kernel void kernel_mul_mat_q4_K_f32( const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; - const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int r2 = tgpig.z; + //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int first_row = r0 * N_DST; const int ib_row = first_row * nb; - device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row; - device const float * y = (device const float *) src1 + r1*ne10; + const 
uint offset0 = r2/gqa*(nb*ne0); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; float yl[16]; float yh[16]; float sumf[N_DST]={0.f}, all_sum; @@ -1630,7 +1568,7 @@ kernel void kernel_mul_mat_q4_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + first_row + row] = all_sum; + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; } } } @@ -1640,10 +1578,14 @@ kernel void kernel_mul_mat_q4_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, constant int64_t & ne01[[buffer(4)]], - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1653,10 +1595,12 @@ kernel void kernel_mul_mat_q4_K_f32( const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; + const int r2 = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int ib_row = first_row * nb; - device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row; - device const float * y = (device const float *) src1 + r1*ne10; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; float yl[8]; float yh[8]; float sumf[N_DST]={0.f}, all_sum; @@ -1712,7 +1656,7 @@ kernel void kernel_mul_mat_q4_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + first_row + row] = all_sum; + dst[r1*ne0+ r2*ne0*ne1 + first_row + row] = all_sum; } } } @@ -1723,9 +1667,14 @@ kernel void kernel_mul_mat_q5_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1733,11 +1682,12 @@ kernel void kernel_mul_mat_q5_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; + const int r2 = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; - - device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb; - device const float * yy = (device const float *) src1 + r1*ne10; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; float sumf[2]={0.f}; @@ -1796,17 +1746,25 @@ kernel void kernel_mul_mat_q5_K_f32( sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2); sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2); - float4 acc = {0.f, 0.f, 0.f, 0.f}; + float4 acc1 = {0.f}; + float4 acc2 = 
{0.f}; for (int l = 0; l < n; ++l) { uint8_t h = qh[l]; - acc[0] += yl[l+0] * ((uint16_t)(q1[l] & 0x0F) + (h & hm1 ? 16 : 0)); - acc[1] += yl[l+8] * ((uint16_t)(q1[l] & 0xF0) + (h & hm2 ? 256 : 0)); - acc[2] += yh[l+0] * ((uint16_t)(q2[l] & 0x0F) + (h & hm3 ? 16 : 0)); - acc[3] += yh[l+8] * ((uint16_t)(q2[l] & 0xF0) + (h & hm4 ? 256 : 0)); + acc1[0] += yl[l+0] * (q1[l] & 0x0F); + acc1[1] += yl[l+8] * (q1[l] & 0xF0); + acc1[2] += yh[l+0] * (q2[l] & 0x0F); + acc1[3] += yh[l+8] * (q2[l] & 0xF0); + acc2[0] += h & hm1 ? yl[l+0] : 0.f; + acc2[1] += h & hm2 ? yl[l+8] : 0.f; + acc2[2] += h & hm3 ? yh[l+0] : 0.f; + acc2[3] += h & hm4 ? yh[l+8] : 0.f; } const float dall = dh[0]; const float dmin = dh[1]; - sumf[row] += dall * (acc[0] * sc8[0] + acc[1] * sc8[1] * 1.f/16.f + acc[2] * sc8[4] + acc[3] * sc8[5] * 1.f/16.f) - + sumf[row] += dall * (sc8[0] * (acc1[0] + 16.f*acc2[0]) + + sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) + + sc8[4] * (acc1[2] + 16.f*acc2[2]) + + sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) - dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); q1 += step; @@ -1871,7 +1829,7 @@ kernel void kernel_mul_mat_q5_K_f32( for (int row = 0; row < 2; ++row) { const float tot = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + first_row + row] = tot; + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot; } } @@ -1882,9 +1840,14 @@ kernel void kernel_mul_mat_q6_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1897,11 +1860,12 @@ kernel void kernel_mul_mat_q6_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; + const int r2 = tgpig.z; const int row = 2 * r0 + sgitg; - - device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb; //r0*nb; - device const float * yy = (device const float *) src1 + r1*ne10; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; float sumf = 0; @@ -1967,6 +1931,423 @@ kernel void kernel_mul_mat_q6_K_f32( const float tot = simd_sum(sumf); if (tiisg == 0) { - dst[r1*ne0 + row] = tot; + dst[r1*ne0 + r2*ne0*ne1 + row] = tot; + } +} + +//============================= templates and their specializations ============================= + +// NOTE: this is not dequantizing - we are simply fitting the template +template +void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { + float4x4 temp = *(((device float4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; + } +} + +template +void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { + half4x4 temp = *(((device half4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; + } +} + +template +void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 1); + const float d1 = il ? 
(xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float md = -8.h * xb->d; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; + reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; + } +} + +template +void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 2); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float m = xb->m; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m; + reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m; + } +} + +template +void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) { + device const int8_t * qs = ((device const int8_t *)xb->qs); + const half d = xb->d; + + for (int i=0;i<16;i++) { + reg[i/4][i%4] = (qs[i + 16*il] * d); + } +} + +template +void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { + const half d = xb->d; + const half min = xb->dmin; + device const uint8_t * q = (device const uint8_t *)xb->qs; + half dl, ml; + uint8_t sc = xb->scales[il]; + +#if QK_K == 256 + q = q + 32*(il/8) + 16*(il&1); + il = (il/2)%4; +#endif + half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template +void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { + const half d_all = xb->d; + device const uint8_t * q = (device const uint8_t *)xb->qs; + device const uint8_t * h = (device const uint8_t *)xb->hmask; + device const int8_t * scales = (device const int8_t *)xb->scales; + +#if QK_K == 256 + q = q + 32 * (il/8) + 16 * (il&1); + h = h + 16 * (il&1); + uint8_t m = 1 << (il/2); + uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \ + ((il/4)>0 ? 12 : 3); + uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; + uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; + int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) + : (scale_2&kmask2) | ((scale_1&kmask1) << 4); + half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h); + const half ml = 4.h * dl; + + il = (il/2) & 3; + const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl *= coef; + + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml); + } +#else + float kcoef = il&1 ? 1.f/16.f : 1.f; + uint16_t kmask = il&1 ? 0xF0 : 0x0F; + float dl = d_all * ((scales[il/2] & kmask) * kcoef - 8); + float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + uint8_t m = 1<<(il*2); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef)); + } +#endif +} + +static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) { + return j < 4 ? 
uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)} + : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))}; +} + +template +void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { + device const uchar * q = xb->qs; + +#if QK_K == 256 + short is = (il/4) * 2; + q = q + (il/4) * 32 + 16 * (il&1); + il = il & 3; + const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); + const half d = il < 2 ? xb->d : xb->d / 16.h; + const half min = xb->dmin; + const half dl = d * sc[0]; + const half ml = min * sc[1]; +#else + q = q + 16 * (il&1); + device const uint8_t * s = xb->scales; + device const half2 * dh = (device const half2 *)xb->d; + const float2 d = (float2)dh[0]; + const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h; + const float ml = il<2 ? d[1] * (s[0]>>4) : d[1] * (s[1]>>4); +#endif + const ushort mask = il<2 ? 0x0F : 0xF0; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template +void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) { + device const uint8_t * q = xb->qs; + device const uint8_t * qh = xb->qh; + +#if QK_K == 256 + short is = (il/4) * 2; + q = q + 32 * (il/4) + 16 * (il&1); + qh = qh + 16 * (il&1); + uint8_t ul = 1 << (il/2); + il = il & 3; + const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); + const half d = il < 2 ? xb->d : xb->d / 16.h; + const half min = xb->dmin; + const half dl = d * sc[0]; + const half ml = min * sc[1]; + + const ushort mask = il<2 ? 0x0F : 0xF0; + const half qh_val = il<2 ? 16.h : 256.h; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; + } +#else + q = q + 16 * (il&1); + device const int8_t * s = xb->scales; + const float dl = xb->d * s[il]; + uint8_t m = 1<<(il*2); + const float coef = il<2 ? 1.f : 1.f/16.f; + const ushort mask = il<2 ? 0x0F : 0xF0; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef)); + } +#endif +} + +template +void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { + const half d_all = xb->d; + device const uint8_t * ql = (device const uint8_t *)xb->ql; + device const uint8_t * qh = (device const uint8_t *)xb->qh; + device const int8_t * scales = (device const int8_t *)xb->scales; + +#if QK_K == 256 + ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); + qh = qh + 32*(il/8) + 16*(il&1); + half sc = scales[(il%2) + 2 * ((il/2))]; + il = (il/2) & 3; +#else + ql = ql + 16 * (il&1); + half sc = scales[il]; +#endif + const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; + const half coef = il>1 ? 1.f/16.h : 1.h; + const half ml = d_all * sc * 32.h; + const half dl = d_all * sc * coef; + for (int i = 0; i < 16; ++i) { + const half q = il&1 ? 
((ql[i] & kmask2) | ((qh[i] & kmask1) << 2)) + : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4)); + reg[i/4][i%4] = dl * q - ml; + } +} + +template +kernel void kernel_get_rows( + device const void * src0, + device const int * src1, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb1, + uint tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tptg[[threads_per_threadgroup]]) { + const int i = tgpig; + const int r = ((device int32_t *) src1)[i]; + + for (int ind = tiitg; ind < ne00/16; ind += tptg) { + float4x4 temp; + dequantize_func( + ((device const block_q *) ((device char *) src0 + r*nb01)) + ind/nl, ind%nl, temp); + *(((device float4x4 *) ((device char *) dst + i*nb1)) + ind) = temp; + } +} + +#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A +#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A +#define BLOCK_SIZE_K 32 +#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A +#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B +#define THREAD_PER_BLOCK 128 +#define THREAD_PER_ROW 2 // 2 thread for each row in matrix A to load numbers +#define THREAD_PER_COL 4 // 4 thread for each row in matrix B to load numbers +#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8 +#define SG_MAT_ROW 8 + +// each block_q contains 16*nl weights +template +kernel void kernel_mul_mm(device const uchar * src0, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & gqa, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + threadgroup half * sa = (threadgroup half *)(shared_memory); + threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); + + const uint r0 = tgpig.y; + const uint r1 = tgpig.x; + const uint im = tgpig.z; + // if this block is of 64x32 shape or smaller + short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; + short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + // a thread shouldn't load data outside of the matrix + short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; + short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? 
((short)tiitg/THREAD_PER_COL) : n_cols - 1; + + simdgroup_half8x8 ma[4]; + simdgroup_float8x8 mb[2]; + simdgroup_float8x8 c_res[8]; + for (int i = 0; i < 8; i++){ + c_res[i] = make_filled_simdgroup_matrix(0.f); + } + + short il = (tiitg % THREAD_PER_ROW); + + uint offset0 = im/gqa*nb02; + ushort offset1 = il/nl; + + device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; + device const float * y = (device const float *)(src1 + + nb12 * im + + nb11 * (r1 * BLOCK_SIZE_N + thread_col) + + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + + for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { + //load data and store to threadgroup memory + half4x4 temp_a; + dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); + #pragma unroll(16) + for (int i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ + + 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \ + + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + } + *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \ + = *((device float2x4 *)y); + il = (il + 2 < nl) ? il + 2 : il % 2; + x = (il < 2) ? x + (2+nl-1)/nl : x; + y += BLOCK_SIZE_K; + + threadgroup_barrier(mem_flags::mem_threadgroup); + //load matrices from threadgroup memory and conduct outer products + threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); + threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + #pragma unroll(4) + for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + #pragma unroll(4) + for (int i = 0; i < 4; i++) { + simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i); + } + simdgroup_barrier(mem_flags::mem_none); + #pragma unroll(2) + for (int i = 0; i < 2; i++) { + simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i); + } + + lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; + lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; + #pragma unroll(8) + for (int i = 0; i < 8; i++){ + simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]); + } + } + } + + if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) { + device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \ + + (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0; + for (int i = 0; i < 8; i++) { + simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0); + } + } else { + // block is smaller than 64x32, we should avoid writing data outside of the matrix + threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ + + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; + for (int i = 0; i < 8; i++) { + simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; + if (sgitg==0) { + for (int i = 0; i < n_rows; i++) { + for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) { + *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M); + } + } + } } } + +#if QK_K == 256 +#define QK_NL 16 +#else +#define QK_NL 4 +#endif + +typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \ + constant uint64_t &, constant uint64_t &, uint, uint, uint); + +template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t 
kernel_get_rows<half4x4, 1, dequantize_f16>; +template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>; +template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>; +template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows<block_q8_0, 2, dequantize_q8_0>; +template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows<block_q2_K, QK_NL, dequantize_q2_K>; +template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows<block_q3_K, QK_NL, dequantize_q3_K>; +template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows<block_q4_K, QK_NL, dequantize_q4_K>; +template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>; +template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>; + +typedef void (mat_mm_t)( + device const uchar * src0, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & gqa, + threadgroup uchar *, uint3, uint, uint); + +template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<float4x4, 1, dequantize_f32>; +template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>; +template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>; +template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>; +template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>; +template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>; +template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>; +template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>; +template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_K, QK_NL, dequantize_q5_K>; +template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q6_K, QK_NL, dequantize_q6_K>; diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 5af320b480729474f66c2834fd14dc364c3cddda..9225749dd48b4aa60c0aa9ceb7309f421b7dda7d 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -850,7 +850,7 @@ std::array mul_str_values = { "mul_f32", "float" }; -std::string& replace(std::string& s, const std::string& from, const std::string& to) { +static std::string& replace(std::string& s, const std::string& from, const std::string& to) { size_t pos = 0; while ((pos = s.find(from, pos)) != std::string::npos) { s.replace(pos, from.length(), to); @@ -859,7 +859,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string& return s; } -std::string generate_kernels() { +static std::string generate_kernels() { std::stringstream src; src << program_source << '\n'; src << k_quants_source << '\n'; @@ -1342,7 +1342,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) { return; } - cl_mem mem = (cl_mem)tensor->data; + cl_mem mem = (cl_mem)tensor->extra; clReleaseMemObject(mem); } @@ -1401,7 +1401,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, size_t d_size; cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0 - cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted. + cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
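+ // NOTE: the pattern here repeats throughout this file: a GPU-resident tensor now
+ // keeps its cl_mem handle in `extra`, while `data` is reserved for host memory.
+ // A minimal sketch of the convention (the helper name is illustrative only and is
+ // not defined anywhere in this file):
+ //
+ //     static cl_mem ggml_cl_get_device_mem(const struct ggml_tensor * t) {
+ //         GGML_ASSERT(t->backend == GGML_BACKEND_GPU); // host tensors carry no cl_mem
+ //         return (cl_mem) t->extra;                    // previously: (cl_mem) t->data
+ //     }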
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst @@ -1499,9 +1499,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr size_t d_size; cl_mem d_X; if (src0->backend == GGML_BACKEND_GPU) { // NOLINT - d_X = (cl_mem) src0->data; + d_X = (cl_mem) src0->extra; } else { - d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); + d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); } cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); @@ -1529,7 +1529,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr &queue, &ev_sgemm); if (status != clblast::StatusCode::kSuccess) { - printf("\nF32 Matmul Failed (%d): [dims: %lld,%lld,%lld,%lld] You may be out of VRAM. Please check if you have enough.\n",status,ne00,ne01,ne10,ne11); + printf("\nF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast<int>(status),ne00,ne01,ne10,ne11); GGML_ASSERT(false); } @@ -1576,7 +1576,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr size_t d_size; cl_mem d_X; if (src0->backend == GGML_BACKEND_GPU) { // NOLINT - d_X = (cl_mem) src0->data; + d_X = (cl_mem) src0->extra; } else { d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); } @@ -1634,7 +1634,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr &queue, &ev_sgemm); if (status != clblast::StatusCode::kSuccess) { - printf("\nF16 Matmul Failed (%d): [dims: %lld,%lld,%lld,%lld] You may be out of VRAM. Please check if you have enough.\n",status,ne00,ne01,ne10,ne11); + printf("\nF16 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast<int>(status),ne00,ne01,ne10,ne11); GGML_ASSERT(false); } @@ -1707,7 +1707,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * events.emplace_back(); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); } else if (src0->backend == GGML_BACKEND_GPU) { - d_Q = (cl_mem) src0->data; + d_Q = (cl_mem) src0->extra; } else { GGML_ASSERT(false); } @@ -1754,7 +1754,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * &queue, events.data() + ev_idx++); if (status != clblast::StatusCode::kSuccess) { - printf("\nQF32 Matmul Failed (%d): [dims: %lld,%lld,%lld,%lld] You may be out of VRAM. Please check if you have enough.\n",status,ne00,ne01,ne10,ne11); + printf("\nQF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM.
Please check if you have enough.\n",static_cast<int>(status),ne00,ne01,ne10,ne11); GGML_ASSERT(false); } } @@ -1808,7 +1808,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens return false; } -bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { +static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { // If device doesn't support FP16 if (!fp16_support) { return false; @@ -1880,6 +1880,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { CL_CHECK(clFinish(queue)); - tensor->data = dst; + tensor->extra = dst; GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); } diff --git a/ggml.c b/ggml.c index 3c2792f8d0feff3ff4c37777c63c56b3b41129ed..81c40041e4deef5f7608b67ff11ed87f4e168add 100644 --- a/ggml.c +++ b/ggml.c @@ -1,4 +1,3 @@ -#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows #include "ggml.h" @@ -47,6 +46,10 @@ // disable "possible loss of data" to avoid hundreds of casts // we should just be careful :) #pragma warning(disable: 4244 4267) + +// disable POSIX deprecation warnings +// these functions are never going away, anyway +#pragma warning(disable: 4996) #endif #if defined(_WIN32) @@ -103,6 +106,9 @@ typedef void * thread_ret_t; #include #include +#endif +#ifdef GGML_USE_CPU_HBM +#include <hbwmalloc.h> +#endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 @@ -123,6 +129,8 @@ typedef void * thread_ret_t; #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 #define GGML_SILU_FP16 +// #define GGML_CROSS_ENTROPY_EXP_FP16 +// #define GGML_FLASH_ATTN_EXP_FP16 #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 @@ -157,12 +165,6 @@ typedef void * thread_ret_t; //#define GGML_SOFT_MAX_ACCELERATE #endif -#if UINTPTR_MAX == 0xFFFFFFFF - #define GGML_MEM_ALIGN 4 -#else - #define GGML_MEM_ALIGN 16 -#endif - // // logging // @@ -192,13 +194,19 @@ typedef void * thread_ret_t; // #if defined(_MSC_VER) || defined(__MINGW32__) -#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) -#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) +#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) +#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else inline static void * ggml_aligned_malloc(size_t size) { + if (size == 0) { + GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); + return NULL; + } void * aligned_memory = NULL; -#ifdef GGML_USE_METAL - int result = posix_memalign(&aligned_memory, getpagesize(), size); +#ifdef GGML_USE_CPU_HBM + int result = hbw_posix_memalign(&aligned_memory, 16, size); +#elif GGML_USE_METAL + int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size); #else int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); #endif @@ -213,14 +221,17 @@ inline static void * ggml_aligned_malloc(size_t size) { error_desc = "insufficient memory"; break; } - GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", - __func__, error_desc, size/(1024.0*1024.0)); + GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); return NULL; } return aligned_memory; } -#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) -#define GGML_ALIGNED_FREE(ptr) free(ptr) +#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
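+// NOTE: allocation and free must come from the same allocator family: memory obtained
+// with hbw_posix_memalign() may only be released with hbw_free(), never plain free().
+// That is why GGML_ALIGNED_FREE below mirrors the GGML_USE_CPU_HBM branch used in
+// ggml_aligned_malloc() above. An illustrative usage sketch:
+//
+//     void * buf = GGML_ALIGNED_MALLOC(1024); // hbw_posix_memalign or posix_memalign
+//     if (buf != NULL) {
+//         // ... use buf ...
+//         GGML_ALIGNED_FREE(buf);             // hbw_free or free, matching the macro pair
+//     }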
+#ifdef GGML_USE_CPU_HBM +#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr) +#else +#define GGML_ALIGNED_FREE(ptr) free(ptr) +#endif #endif #define UNUSED GGML_UNUSED @@ -272,7 +283,7 @@ typedef double ggml_float; // 16-bit float // on Arm, we use __fp16 // on x86, we use uint16_t -#ifdef __ARM_NEON +#if defined(__ARM_NEON) && !defined(_MSC_VER) // if YCM cannot find , make a symbolic link to it, for example: // @@ -299,12 +310,18 @@ typedef double ggml_float; #if defined(_MSC_VER) || defined(__MINGW32__) #include #else +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) #if !defined(__riscv) #include #endif #endif #endif #endif +#endif + +#ifdef __riscv_v_intrinsic +#include +#endif #ifdef __F16C__ @@ -819,46 +836,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #if !defined(__aarch64__) -inline static uint16_t vaddvq_u8(uint8x16_t v) { - return - (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) + - (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) + - (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) + - (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) + - (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) + - (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) + - (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) + - (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15); -} - -inline static int16_t vaddvq_s8(int8x16_t v) { - return - (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) + - (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) + - (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) + - (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) + - (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) + - (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) + - (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) + - (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15); -} - -inline static int32_t vaddvq_s16(int16x8_t v) { - return - (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + - (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + - (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + - (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); -} - -inline static uint32_t vaddvq_u16(uint16x8_t v) { - return - (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) + - (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) + - (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) + - (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7); -} - inline static int32_t vaddvq_s32(int32x4_t v) { return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); } @@ -867,12 +844,6 @@ inline static float vaddvq_f32(float32x4_t v) { return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); } -inline static float vminvq_f32(float32x4_t v) { - return - MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - inline static float vmaxvq_f32(float32x4_t v) { return MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), @@ -1644,11 +1615,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * 
restrict vy); static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { + [GGML_TYPE_I8] = { + .type_name = "i8", + .blck_size = 1, + .type_size = sizeof(int8_t), + .is_quantized = false, + }, + [GGML_TYPE_I16] = { + .type_name = "i16", + .blck_size = 1, + .type_size = sizeof(int16_t), + .is_quantized = false, + }, + [GGML_TYPE_I32] = { + .type_name = "i32", + .blck_size = 1, + .type_size = sizeof(int32_t), + .is_quantized = false, + }, [GGML_TYPE_F32] = { + .type_name = "f32", + .blck_size = 1, + .type_size = sizeof(float), + .is_quantized = false, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, .vec_dot_type = GGML_TYPE_F32, }, [GGML_TYPE_F16] = { + .type_name = "f16", + .blck_size = 1, + .type_size = sizeof(ggml_fp16_t), + .is_quantized = false, .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, @@ -1656,6 +1653,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_F16, }, [GGML_TYPE_Q4_0] = { + .type_name = "q4_0", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q4_0, .from_float = quantize_row_q4_0, .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, @@ -1663,6 +1664,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q4_1] = { + .type_name = "q4_1", + .blck_size = QK4_1, + .type_size = sizeof(block_q4_1), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q4_1, .from_float = quantize_row_q4_1, .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, @@ -1670,6 +1675,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q5_0] = { + .type_name = "q5_0", + .blck_size = QK5_0, + .type_size = sizeof(block_q5_0), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q5_0, .from_float = quantize_row_q5_0, .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, @@ -1677,6 +1686,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q5_1] = { + .type_name = "q5_1", + .blck_size = QK5_1, + .type_size = sizeof(block_q5_1), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q5_1, .from_float = quantize_row_q5_1, .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, @@ -1684,6 +1697,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q8_0] = { + .type_name = "q8_0", + .blck_size = QK8_0, + .type_size = sizeof(block_q8_0), + .is_quantized = true, .to_float = dequantize_row_q8_0, .from_float = quantize_row_q8_0, .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, @@ -1691,12 +1708,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q8_1] = { + .type_name = "q8_1", + .blck_size = QK8_1, + .type_size = sizeof(block_q8_1), + .is_quantized = true, .from_float = quantize_row_q8_1, .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, }, #ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = { + .type_name = "q2_K", + .blck_size = QK_K, + .type_size = sizeof(block_q2_K), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q2_K, .from_float = 
quantize_row_q2_K, .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, @@ -1704,6 +1729,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q3_K] = { + .type_name = "q3_K", + .blck_size = QK_K, + .type_size = sizeof(block_q3_K), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q3_K, .from_float = quantize_row_q3_K, .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, @@ -1711,6 +1740,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q4_K] = { + .type_name = "q4_K", + .blck_size = QK_K, + .type_size = sizeof(block_q4_K), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q4_K, .from_float = quantize_row_q4_K, .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, @@ -1718,6 +1751,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q5_K] = { + .type_name = "q5_K", + .blck_size = QK_K, + .type_size = sizeof(block_q5_K), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q5_K, .from_float = quantize_row_q5_K, .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, @@ -1725,6 +1762,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q6_K] = { + .type_name = "q6_K", + .blck_size = QK_K, + .type_size = sizeof(block_q6_K), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q6_K, .from_float = quantize_row_q6_K, .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, @@ -1732,15 +1773,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q8_K] = { + .type_name = "q8_K", + .blck_size = QK_K, + .type_size = sizeof(block_q8_K), + .is_quantized = true, .from_float = quantize_row_q8_K, } #endif }; // For internal test use -ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) { - GGML_ASSERT(i < GGML_TYPE_COUNT); - return type_traits[i]; +ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { + GGML_ASSERT(type < GGML_TYPE_COUNT); + return type_traits[type]; } @@ -2364,7 +2409,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); const block_q4_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -2373,6 +2417,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q4_0 * restrict x0 = &x[i + 0]; const block_q4_0 * restrict x1 = &x[i + 1]; @@ -2551,6 +2596,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * } // Main loop + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 2; i < nb; i+=2) { _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); @@ -2608,6 +2654,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * } *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + 
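For readers new to the RVV intrinsics, the vector loop entered just above computes, per q4_0/q8_0 block pair, the same value as the scalar fallback at the end of this function. A minimal scalar sketch of one iteration, assuming the q4_0 layout used throughout ggml.c (qk == QK4_0 == 32 weights per block, two 4-bit nibbles per byte, values recentered by 8):

    // illustrative scalar equivalent of one RVV loop iteration (sketch only,
    // mirrors the existing scalar branch of ggml_vec_dot_q4_0_q8_0)
    int sumi = 0;
    for (int j = 0; j < qk/2; ++j) {
        const int v0 = (x[i].qs[j] & 0x0F) - 8; // low nibble
        const int v1 = (x[i].qs[j] >>   4) - 8; // high nibble
        sumi += v0*y[i].qs[j] + v1*y[i].qs[j + qk/2];
    }
    sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);

The widening multiplies (__riscv_vwmul_vv_i16m2) and reductions (__riscv_vwredsum_vs_i16m2_i32m1) that follow fold this inner loop into a handful of vector operations; the vector length vl is set to qk/2 == 16 bytes, which is why the second half of the int8 block is loaded from y[i].qs+16.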
vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -2634,7 +2715,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); const block_q4_1 * restrict x = vx; const block_q8_1 * restrict y = vy; @@ -2646,6 +2726,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * float summs = 0; + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q4_1 * restrict x0 = &x[i + 0]; const block_q4_1 * restrict x1 = &x[i + 1]; @@ -2734,6 +2815,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * } *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -2760,7 +2873,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); assert(qk == QK5_0); const block_q5_0 * restrict x = vx; @@ -2776,6 +2888,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * uint64_t tmp0[4]; uint64_t tmp1[4]; + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q5_0 * restrict x0 = &x[i]; const block_q5_0 * restrict x1 = &x[i + 1]; @@ -2968,6 +3081,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * } *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + // These temp values are for masking and shift operations + uint32_t 
temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000}; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // temporary registers + vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl); + vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl); + vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl); + vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl); + + // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl); + vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl); + vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl); + + // ((qh & (1u << (j + 16))) >> (j + 12)); + vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl); + vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl); + + // narrowing + vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl); + vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl); + + vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl); + vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl); + + // load + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl); + vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3000,7 +3183,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); assert(qk == QK5_1); const block_q5_1 * restrict x = vx; @@ -3019,6 +3201,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * uint64_t tmp0[4]; uint64_t tmp1[4]; + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q5_1 * restrict x0 = &x[i]; const block_q5_1 * restrict x1 = &x[i + 1]; @@ -3224,6 +3407,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * } *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + // These temp values are for shift operations + uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // temporary registers + vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl); + vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl); + + // load qh + 
vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl); + + // ((qh >> (j + 0)) << 4) & 0x10; + vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl); + vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl); + vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl); + + // ((qh >> (j + 12)) ) & 0x10; + vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl); + vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl); + + // narrowing + vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl); + vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl); + + vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl); + vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl); + + // load + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl); + vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl); + + vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3256,7 +3505,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); const block_q8_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -3265,6 +3513,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q8_0 * restrict x0 = &x[i + 0]; const block_q8_0 * restrict x1 = &x[i + 1]; @@ -3335,6 +3584,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * } *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + size_t vl = __riscv_vsetvl_e8m1(qk); + + for (int i = 0; i < nb; i++) { + // load elements + vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3482,9 +3751,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 
x[i] : 0.f; } -static const float GELU_COEF_A = 0.044715f; -static const float GELU_QUICK_COEF = -1.702f; -static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; +static const float GELU_COEF_A = 0.044715f; +static const float GELU_QUICK_COEF = -1.702f; +static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; inline static float ggml_gelu_f32(float x) { return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); @@ -3653,95 +3922,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { // data types // -static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = 1, - [GGML_TYPE_F16] = 1, - [GGML_TYPE_Q4_0] = QK4_0, - [GGML_TYPE_Q4_1] = QK4_1, - [GGML_TYPE_Q5_0] = QK5_0, - [GGML_TYPE_Q5_1] = QK5_1, - [GGML_TYPE_Q8_0] = QK8_0, - [GGML_TYPE_Q8_1] = QK8_1, -#ifdef GGML_USE_K_QUANTS - [GGML_TYPE_Q2_K] = QK_K, - [GGML_TYPE_Q3_K] = QK_K, - [GGML_TYPE_Q4_K] = QK_K, - [GGML_TYPE_Q5_K] = QK_K, - [GGML_TYPE_Q6_K] = QK_K, - [GGML_TYPE_Q8_K] = QK_K, -#endif - [GGML_TYPE_I8] = 1, - [GGML_TYPE_I16] = 1, - [GGML_TYPE_I32] = 1, -}; -static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated"); - -static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = sizeof(float), - [GGML_TYPE_F16] = sizeof(ggml_fp16_t), - [GGML_TYPE_Q4_0] = sizeof(block_q4_0), - [GGML_TYPE_Q4_1] = sizeof(block_q4_1), - [GGML_TYPE_Q5_0] = sizeof(block_q5_0), - [GGML_TYPE_Q5_1] = sizeof(block_q5_1), - [GGML_TYPE_Q8_0] = sizeof(block_q8_0), - [GGML_TYPE_Q8_1] = sizeof(block_q8_1), -#ifdef GGML_USE_K_QUANTS - [GGML_TYPE_Q2_K] = sizeof(block_q2_K), - [GGML_TYPE_Q3_K] = sizeof(block_q3_K), - [GGML_TYPE_Q4_K] = sizeof(block_q4_K), - [GGML_TYPE_Q5_K] = sizeof(block_q5_K), - [GGML_TYPE_Q6_K] = sizeof(block_q6_K), - [GGML_TYPE_Q8_K] = sizeof(block_q8_K), -#endif - [GGML_TYPE_I8] = sizeof(int8_t), - [GGML_TYPE_I16] = sizeof(int16_t), - [GGML_TYPE_I32] = sizeof(int32_t), -}; -static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated"); - - -static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = "f32", - [GGML_TYPE_F16] = "f16", - [GGML_TYPE_Q4_0] = "q4_0", - [GGML_TYPE_Q4_1] = "q4_1", - [GGML_TYPE_Q5_0] = "q5_0", - [GGML_TYPE_Q5_1] = "q5_1", - [GGML_TYPE_Q8_0] = "q8_0", - [GGML_TYPE_Q8_1] = "q8_1", - [GGML_TYPE_Q2_K] = "q2_K", - [GGML_TYPE_Q3_K] = "q3_K", - [GGML_TYPE_Q4_K] = "q4_K", - [GGML_TYPE_Q5_K] = "q5_K", - [GGML_TYPE_Q6_K] = "q6_K", - [GGML_TYPE_Q8_K] = "q8_K", - [GGML_TYPE_I8] = "i8", - [GGML_TYPE_I16] = "i16", - [GGML_TYPE_I32] = "i32", -}; -static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated"); - -static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = false, - [GGML_TYPE_F16] = false, - [GGML_TYPE_Q4_0] = true, - [GGML_TYPE_Q4_1] = true, - [GGML_TYPE_Q5_0] = true, - [GGML_TYPE_Q5_1] = true, - [GGML_TYPE_Q8_0] = true, - [GGML_TYPE_Q8_1] = true, - [GGML_TYPE_Q2_K] = true, - [GGML_TYPE_Q3_K] = true, - [GGML_TYPE_Q4_K] = true, - [GGML_TYPE_Q5_K] = true, - [GGML_TYPE_Q6_K] = true, - [GGML_TYPE_Q8_K] = true, - [GGML_TYPE_I8] = false, - [GGML_TYPE_I16] = false, - [GGML_TYPE_I32] = false, -}; -static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated"); - static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "NONE", @@ -3761,10 +3941,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ARGMAX", "REPEAT", "REPEAT_BACK", + "CONCAT", "SILU_BACK", "NORM", "RMS_NORM", "RMS_NORM_BACK", + "GROUP_NORM", "MUL_MAT", "OUT_PROD", @@ -3790,20 +3972,28 @@ static const char * 
GGML_OP_NAME[GGML_OP_COUNT] = { "CLAMP", "CONV_1D", "CONV_2D", + "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", + "UPSCALE", "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", "WIN_PART", "WIN_UNPART", + "GET_REL_POS", + "ADD_REL_POS", "UNARY", "MAP_UNARY", "MAP_BINARY", + "MAP_CUSTOM1_F32", + "MAP_CUSTOM2_F32", + "MAP_CUSTOM3_F32", + "MAP_CUSTOM1", "MAP_CUSTOM2", "MAP_CUSTOM3", @@ -3812,7 +4002,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3833,10 +4023,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "argmax(x)", "repeat(x)", "repeat_back(x)", + "concat(x, y)", "silu_back(x)", "norm(x)", "rms_norm(x)", "rms_norm_back(x)", + "group_norm(x)", "X*Y", "X*Y", @@ -3862,20 +4054,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "clamp(x)", "conv_1d(x)", "conv_2d(x)", + "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", + "upscale(x)", "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", "win_part(x)", "win_unpart(x)", + "get_rel_pos(x)", + "add_rel_pos(x)", "unary(x)", "f(x)", "f(x,y)", + "custom_f32(x)", + "custom_f32(x,y)", + "custom_f32(x,y,z)", + "custom(x)", "custom(x,y)", "custom(x,y,z)", @@ -3884,7 +4084,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -3914,8 +4114,10 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_DIAG_MASK_ZERO ] = true; p[GGML_OP_CONV_1D ] = true; p[GGML_OP_CONV_2D ] = true; + p[GGML_OP_CONV_TRANSPOSE_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + p[GGML_OP_ADD_REL_POS ] = true; } { // FINALIZE @@ -4102,38 +4304,52 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) { } size_t ggml_nbytes(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + size_t nbytes; + size_t blck_size = ggml_blck_size(tensor->type); + if (blck_size == 1) { + nbytes = ggml_type_size(tensor->type); + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + else { + nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; + for (int i = 1; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } - // this should handle cases where the tensor is not contiguous in memory - // probaby just: - // - // return tensor->ne[3]*tensor->nb[3] - // - // is enough, but just in case, adding the second part + return nbytes; +} - return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN); +size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { + return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); } size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]; + return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type); } int ggml_blck_size(enum ggml_type type) { - return GGML_BLCK_SIZE[type]; + return type_traits[type].blck_size; } 
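The accessor rewrites in this hunk are the payoff of the type_traits consolidation: the four parallel tables (GGML_BLCK_SIZE, GGML_TYPE_SIZE, GGML_TYPE_NAME, GGML_IS_QUANTIZED, all deleted above) collapse into one traits array, and size arithmetic goes through the accessors. A minimal sketch of the resulting row-size computation (the helper name is hypothetical; the same expression appears in ggml_nbytes_split and later in the mul_mat row_size):

    // bytes occupied by one row of ne elements of a given type:
    // ne/blck_size blocks, each type_size bytes (sketch, not part of the patch)
    static size_t example_row_size(enum ggml_type type, int64_t ne) {
        return ne*ggml_type_size(type)/ggml_blck_size(type);
    }

For example, a 4096-element q4_0 row is 4096/32 == 128 blocks of sizeof(block_q4_0) == 18 bytes each (a 2-byte fp16 scale plus 16 packed bytes), i.e. 2304 bytes.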
size_t ggml_type_size(enum ggml_type type) { - return GGML_TYPE_SIZE[type]; + return type_traits[type].type_size; } float ggml_type_sizef(enum ggml_type type) { - return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type]; + return ((float)(type_traits[type].type_size))/type_traits[type].blck_size; } const char * ggml_type_name(enum ggml_type type) { - return GGML_TYPE_NAME[type]; + return type_traits[type].type_name; +} + +bool ggml_is_quantized(enum ggml_type type) { + return type_traits[type].is_quantized; } const char * ggml_op_name(enum ggml_op op) { @@ -4145,7 +4361,7 @@ const char * ggml_op_symbol(enum ggml_op op) { } size_t ggml_element_size(const struct ggml_tensor * tensor) { - return GGML_TYPE_SIZE[tensor->type]; + return ggml_type_size(tensor->type); } static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) { @@ -4183,10 +4399,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct (t0->ne[3] == t1->ne[3]); } -bool ggml_is_quantized(enum ggml_type type) { - return GGML_IS_QUANTIZED[type]; -} - enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { enum ggml_type wtype = GGML_TYPE_COUNT; @@ -4224,8 +4436,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return - tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && - tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] && + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } @@ -4234,7 +4446,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return - tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && + tensor->nb[0] == ggml_type_size(tensor->type) && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } @@ -4249,7 +4461,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return - tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && + tensor->nb[0] == ggml_type_size(tensor->type) && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } @@ -4384,6 +4596,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { return NULL; } + // allow to call ggml_init with 0 size + if (params.mem_size == 0) { + params.mem_size = GGML_MEM_ALIGN; + } + const size_t mem_size = params.mem_buffer ? 
params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN); *ctx = (struct ggml_context) { @@ -4561,36 +4778,51 @@ static struct ggml_tensor * ggml_new_tensor_impl( enum ggml_type type, int n_dims, const int64_t * ne, - void * data) { + struct ggml_tensor * view_src, + size_t view_offs) { assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); - size_t data_size = 0; + // find the base tensor and absolute offset + if (view_src != NULL && view_src->view_src != NULL) { + view_offs += view_src->view_offs; + view_src = view_src->view_src; + } - if (data == NULL && !ctx->no_alloc) { - data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]); - for (int i = 1; i < n_dims; i++) { - data_size *= ne[i]; - } + size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type)); + for (int i = 1; i < n_dims; i++) { + data_size *= ne[i]; } - if (ctx->scratch.data != NULL && data == NULL) { - // allocate tensor data in the scratch buffer - if (ctx->scratch.offs + data_size > ctx->scratch.size) { - GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", - __func__, ctx->scratch.offs + data_size, ctx->scratch.size); - assert(false); - return NULL; - } + GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src)); - data = (char * const) ctx->scratch.data + ctx->scratch.offs; + void * data = view_src != NULL ? view_src->data : NULL; + if (data != NULL) { + data = (char *) data + view_offs; + } + + size_t obj_alloc_size = 0; + + if (view_src == NULL && !ctx->no_alloc) { + if (ctx->scratch.data != NULL) { + // allocate tensor data in the scratch buffer + if (ctx->scratch.offs + data_size > ctx->scratch.size) { + GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", + __func__, ctx->scratch.offs + data_size, ctx->scratch.size); + assert(false); + return NULL; + } - ctx->scratch.offs += data_size; + data = (char * const) ctx->scratch.data + ctx->scratch.offs; - data_size = 0; + ctx->scratch.offs += data_size; + } else { + // allocate tensor data in the context's memory pool + obj_alloc_size = data_size; + } } - struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size); + struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here @@ -4610,7 +4842,9 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, - /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data, + /*.view_src =*/ view_src, + /*.view_offs =*/ view_offs, + /*.data =*/ obj_alloc_size > 0 ? 
(void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, /*.padding =*/ { 0 }, @@ -4623,8 +4857,8 @@ static struct ggml_tensor * ggml_new_tensor_impl( result->ne[i] = ne[i]; } - result->nb[0] = GGML_TYPE_SIZE[type]; - result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]); + result->nb[0] = ggml_type_size(type); + result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type)); for (int i = 2; i < GGML_MAX_DIMS; i++) { result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; } @@ -4634,28 +4868,12 @@ static struct ggml_tensor * ggml_new_tensor_impl( return result; } -static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { - GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings - assert(params_size <= GGML_MAX_OP_PARAMS); - memcpy(tensor->op_params, params, params_size); -} - -static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) { - assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); - return ((const int32_t *)(tensor->op_params))[i]; -} - -static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { - assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); - ((int32_t *)(tensor->op_params))[i] = value; -} - struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, enum ggml_type type, int n_dims, const int64_t * ne) { - return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL); + return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0); } struct ggml_tensor * ggml_new_tensor_1d( @@ -4720,7 +4938,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { } struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { - return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL); + return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne); +} + +static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { + GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings + assert(params_size <= GGML_MAX_OP_PARAMS); + memcpy(tensor->op_params, params, params_size); +} + +static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); + return ((const int32_t *)(tensor->op_params))[i]; +} + +static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); + ((int32_t *)(tensor->op_params))[i] = value; } struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { @@ -5006,14 +5240,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, - const struct ggml_tensor * src) { - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + struct ggml_tensor * src) { + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0); ggml_format_name(result, "%s (view)", src->name); - result->nb[0] = src->nb[0]; - result->nb[1] = src->nb[1]; - result->nb[2] = src->nb[2]; - result->nb[3] = src->nb[3]; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = src->nb[i]; + } return result; } @@ -5271,7 +5504,7 @@ static struct ggml_tensor * ggml_mul_impl( } if (inplace) { - GGML_ASSERT(is_node == false); + GGML_ASSERT(!is_node); } struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -5314,7 +5547,7 @@ static struct ggml_tensor * ggml_div_impl( } if (inplace) { - GGML_ASSERT(is_node == false); + GGML_ASSERT(!is_node); } struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -5546,10 +5779,6 @@ struct ggml_tensor * ggml_repeat( is_node = true; } - if (ggml_are_same_shape(a, b) && !is_node) { - return a; - } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); result->op = GGML_OP_REPEAT; @@ -5588,6 +5817,30 @@ struct ggml_tensor * ggml_repeat_back( return result; } +// ggml_concat + +struct ggml_tensor * ggml_concat( + struct ggml_context* ctx, + struct ggml_tensor* a, + struct ggml_tensor* b) { + GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]); + + result->op = GGML_OP_CONCAT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + // ggml_abs struct ggml_tensor * ggml_abs( @@ -5756,6 +6009,7 @@ struct ggml_tensor * ggml_silu_back( static struct ggml_tensor * ggml_norm_impl( struct ggml_context * ctx, struct ggml_tensor * a, + float eps, bool inplace) { bool is_node = false; @@ -5766,7 +6020,7 @@ static struct ggml_tensor * ggml_norm_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - // TODO: maybe store epsilon here? + ggml_set_op_params(result, &eps, sizeof(eps)); result->op = GGML_OP_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5777,16 +6031,20 @@ static struct ggml_tensor * ggml_norm_impl( struct ggml_tensor * ggml_norm( struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_norm_impl(ctx, a, false); + struct ggml_tensor * a, + float eps) { + return ggml_norm_impl(ctx, a, eps, false); } struct ggml_tensor * ggml_norm_inplace( struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_norm_impl(ctx, a, true); + struct ggml_tensor * a, + float eps) { + return ggml_norm_impl(ctx, a, eps, true); } +// ggml_rms_norm + static struct ggml_tensor * ggml_rms_norm_impl( struct ggml_context * ctx, struct ggml_tensor * a, @@ -5823,10 +6081,13 @@ struct ggml_tensor * ggml_rms_norm_inplace( return ggml_rms_norm_impl(ctx, a, eps, true); } +// ggml_rms_norm_back + struct ggml_tensor * ggml_rms_norm_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * b, + float eps) { bool is_node = false; if (a->grad) { @@ -5836,6 +6097,8 @@ struct ggml_tensor * ggml_rms_norm_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + ggml_set_op_params(result, &eps, sizeof(eps)); + result->op = GGML_OP_RMS_NORM_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; @@ -5844,6 +6107,44 @@ struct ggml_tensor * ggml_rms_norm_back( return result; } +// ggml_group_norm + +static struct ggml_tensor * ggml_group_norm_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups, + bool inplace) { + + bool is_node = false; + if (!inplace && (a->grad)) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_GROUP_NORM; + result->op_params[0] = n_groups; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; // TODO: maybe store epsilon here? + + return result; +} + +struct ggml_tensor * ggml_group_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups) { + return ggml_group_norm_impl(ctx, a, n_groups, false); +} + +struct ggml_tensor * ggml_group_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups) { + return ggml_group_norm_impl(ctx, a, n_groups, true); +} // ggml_mul_mat @@ -6127,7 +6428,7 @@ struct ggml_tensor * ggml_reshape( //GGML_ASSERT(false); } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -6151,7 +6452,7 @@ struct ggml_tensor * ggml_reshape_1d( } const int64_t ne[1] = { ne0 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -6176,7 +6477,7 @@ struct ggml_tensor * ggml_reshape_2d( } const int64_t ne[2] = { ne0, ne1 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -6202,7 +6503,7 @@ struct ggml_tensor * ggml_reshape_3d( } const int64_t ne[3] = { ne0, ne1, ne2 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -6212,7 +6513,6 @@ struct ggml_tensor * ggml_reshape_3d( return result; } - struct ggml_tensor * ggml_reshape_4d( struct ggml_context * ctx, struct ggml_tensor * a, @@ -6230,7 +6530,7 @@ struct ggml_tensor * ggml_reshape_4d( } const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -6240,46 +6540,40 @@ struct ggml_tensor * ggml_reshape_4d( return result; } -// ggml_view_1d - -static struct ggml_tensor * ggml_view_tensor_offset( +static struct ggml_tensor * ggml_view_impl( struct ggml_context * ctx, struct ggml_tensor * a, int n_dims, const int64_t * ne, size_t offset) { - // don't calculate an offset from an unallocated tensor - void * data = NULL; - if (a->data != NULL) { - data = (char *) a->data + offset; - } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data); + bool is_node = false; + + if (a->grad) { + is_node = true; + } + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset); ggml_format_name(result, "%s (view)", a->name); ggml_set_op_params(result, &offset, sizeof(offset)); + result->op = GGML_OP_VIEW; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + return result; } +// ggml_view_1d + struct ggml_tensor * ggml_view_1d( struct ggml_context * ctx, struct ggml_tensor * a, int64_t ne0, size_t offset) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - - struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset); - - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; + struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset); return result; } @@ -6294,24 +6588,14 @@ struct ggml_tensor * ggml_view_2d( size_t nb1, size_t offset) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - - const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 }; + const int64_t ne[2] = { ne0, ne1 }; - struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset); result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; result->nb[3] = result->nb[2]; - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - return result; } @@ -6327,24 +6611,14 @@ struct ggml_tensor * ggml_view_3d( size_t nb2, size_t offset) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - - const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; + const int64_t ne[3] = { ne0, ne1, ne2 }; - struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset); result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = result->nb[2]*ne2; - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - return result; } @@ -6362,24 +6636,14 @@ struct ggml_tensor * ggml_view_4d( size_t nb3, size_t offset) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - - const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 }; + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; - struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset); result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = nb3; - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - return result; } @@ -6566,7 +6830,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[] = { n_past, inplace ? 1 : 0 }; + int32_t params[] = { n_past }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_DIAG_MASK_INF; @@ -6583,7 +6847,6 @@ struct ggml_tensor * ggml_diag_mask_inf( return ggml_diag_mask_inf_impl(ctx, a, n_past, false); } - struct ggml_tensor * ggml_diag_mask_inf_inplace( struct ggml_context * ctx, struct ggml_tensor * a, @@ -6606,7 +6869,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[] = { n_past, inplace ? 
1 : 0 }; + int32_t params[] = { n_past }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_DIAG_MASK_ZERO; @@ -6712,6 +6975,8 @@ static struct ggml_tensor * ggml_rope_impl( int n_ctx, float freq_base, float freq_scale, + float xpos_base, + bool xpos_down, bool inplace) { GGML_ASSERT(n_past >= 0); bool is_node = false; @@ -6722,9 +6987,11 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[6] = { n_past, n_dims, mode, n_ctx }; + int32_t params[8] = { n_past, n_dims, mode, n_ctx }; memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &xpos_base, sizeof(float)); + memcpy(params + 7, &xpos_down, sizeof(bool)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; @@ -6741,7 +7008,7 @@ struct ggml_tensor * ggml_rope( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); } struct ggml_tensor * ggml_rope_inplace( @@ -6751,7 +7018,7 @@ struct ggml_tensor * ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); } struct ggml_tensor * ggml_rope_custom( @@ -6763,7 +7030,7 @@ struct ggml_tensor * ggml_rope_custom( int n_ctx, float freq_base, float freq_scale) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); } struct ggml_tensor * ggml_rope_custom_inplace( @@ -6775,7 +7042,17 @@ struct ggml_tensor * ggml_rope_custom_inplace( int n_ctx, float freq_base, float freq_scale) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); +} + +struct ggml_tensor * ggml_rope_xpos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + float base, + bool down) { + return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); } // ggml_rope_back @@ -6786,7 +7063,11 @@ struct ggml_tensor * ggml_rope_back( int n_past, int n_dims, int mode, - int n_ctx) { + int n_ctx, + float freq_base, + float freq_scale, + float xpos_base, + bool xpos_down) { GGML_ASSERT(n_past >= 0); GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); @@ -6798,7 +7079,11 @@ struct ggml_tensor * ggml_rope_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - int32_t params[] = { n_past, n_dims, mode, n_ctx }; + int32_t params[8] = { n_past, n_dims, mode, n_ctx }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &xpos_base, sizeof(float)); + memcpy(params + 7, &xpos_down, sizeof(bool)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE_BACK; @@ -6905,6 +7190,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d( return result; } +// ggml_conv_1d_ph + +struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d) { + return ggml_conv_1d(ctx, a, b, s, 
a->ne[0] / 2, d); +} + // ggml_conv_2d struct ggml_tensor * ggml_conv_2d( @@ -6945,17 +7241,61 @@ struct ggml_tensor * ggml_conv_2d( } -// ggml_conv_1d_ph +// ggml_conv_2d_sk_p0 -struct ggml_tensor * ggml_conv_1d_ph( +struct ggml_tensor * ggml_conv_2d_sk_p0( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b, - int s, - int d) { - return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); + struct ggml_tensor * b) { + return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1); +} + +// ggml_conv_2d_s1_ph + +struct ggml_tensor * ggml_conv_2d_s1_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1); +} + +// ggml_conv_transpose_2d_p0 + +static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { + return (ins - 1) * s - 2 * p + ks; } +struct ggml_tensor * ggml_conv_transpose_2d_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride) { + GGML_ASSERT(a->ne[3] == b->ne[2]); + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/), + ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/), + a->ne[2], b->ne[3], + }; + + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + ggml_set_op_params_i32(result, 0, stride); + + result->op = GGML_OP_CONV_TRANSPOSE_2D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} // ggml_pool_* @@ -7033,6 +7373,40 @@ struct ggml_tensor * ggml_pool_2d( return result; } +// ggml_upscale + +static struct ggml_tensor * ggml_upscale_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor) { + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, + a->ne[0] * scale_factor, + a->ne[1] * scale_factor, + a->ne[2], a->ne[3]); + + result->op = GGML_OP_UPSCALE; + result->op_params[0] = scale_factor; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +struct ggml_tensor * ggml_upscale( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor) { + return ggml_upscale_impl(ctx, a, scale_factor); +} + // ggml_flash_attn struct ggml_tensor * ggml_flash_attn( @@ -7231,6 +7605,87 @@ struct ggml_tensor * ggml_win_unpart( return result; } +// ggml_get_rel_pos + +struct ggml_tensor * ggml_get_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + int qh, + int kh) { + GGML_ASSERT(qh == kh); + GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]); + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], kh, qh, 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne); + + result->op = GGML_OP_GET_REL_POS; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +// ggml_add_rel_pos + +static struct ggml_tensor * ggml_add_rel_pos_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph, + bool inplace) { + GGML_ASSERT(ggml_are_same_shape(pw, ph)); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_is_contiguous(pw)); + GGML_ASSERT(ggml_is_contiguous(ph)); + GGML_ASSERT(ph->type == GGML_TYPE_F32); + GGML_ASSERT(pw->type == GGML_TYPE_F32); + GGML_ASSERT(pw->ne[3] == a->ne[2]); + GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]); + GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]); + + bool is_node = false; + + if (!inplace && (a->grad || pw->grad || ph->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + ggml_set_op_params_i32(result, 0, inplace ? 1 : 0); + + result->op = GGML_OP_ADD_REL_POS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = pw; + result->src[2] = ph; + + return result; +} + + +struct ggml_tensor * ggml_add_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph) { + return ggml_add_rel_pos_impl(ctx, a, pw, ph, false); +} + +struct ggml_tensor * ggml_add_rel_pos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph) { + return ggml_add_rel_pos_impl(ctx, a, pw, ph, true); +} + // gmml_unary static struct ggml_tensor * ggml_unary_impl( @@ -7746,7 +8201,7 @@ static void ggml_compute_forward_dup_same_cont( memcpy( ((char *) dst->data + ie0*nb0), ((char *) src0->data + ie0*nb00), - (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); + (ie1 - ie0) * ggml_type_size(src0->type)); } } @@ -7780,7 +8235,7 @@ static void ggml_compute_forward_dup_f16( if (src0->type == dst->type && ne00 == ne0 && - nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { // copy by rows const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -7838,7 +8293,7 @@ static void ggml_compute_forward_dup_f16( float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; size_t id = 0; - size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { @@ -8051,7 +8506,7 @@ static void ggml_compute_forward_dup_f32( if (src0->type == dst->type && ne00 == ne0 && - nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { // copy by rows const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -8090,7 +8545,7 @@ static void ggml_compute_forward_dup_f32( ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; size_t id = 0; - size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { @@ -8502,7 +8957,7 @@ static void ggml_compute_forward_add_q_f32( ggml_from_float_t const quantize_row_q = type_traits[type].from_float; // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb10 == 
sizeof(float)); // dst cannot be transposed or permuted @@ -8776,7 +9231,7 @@ static void ggml_compute_forward_add1_q_f32( ggml_from_float_t const quantize_row_q = type_traits[type].from_float; // we don't support permuted src0 - GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb00 == ggml_type_size(type)); // dst cannot be transposed or permuted GGML_ASSERT(nb0 <= nb1); @@ -9138,6 +9593,8 @@ static void ggml_compute_forward_mul( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); + switch (src0->type) { case GGML_TYPE_F32: { @@ -9180,6 +9637,8 @@ static void ggml_compute_forward_div_f32( #ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_div_f32); + vDSP_vdiv( (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, @@ -9732,6 +10191,72 @@ static void ggml_compute_forward_repeat_back( } } +// ggml_compute_forward_concat + +static void ggml_compute_forward_concat_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + + GGML_TENSOR_BINARY_OP_LOCALS; + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ith; i2 < ne2; i2++) { + if (i2 < ne02) { // src0 + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03); + + float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3); + *y = *x; + } + } + } // src1 + else { + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13); + + float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3); + *y = *x; + } + } + } + } + } +} + +static void ggml_compute_forward_concat( + const struct ggml_compute_params* params, + const struct ggml_tensor* src0, + const struct ggml_tensor* src1, + struct ggml_tensor* dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_concat_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_abs static void ggml_compute_forward_abs_f32( @@ -10286,7 +10811,8 @@ static void ggml_compute_forward_norm_f32( GGML_TENSOR_UNARY_OP_LOCALS; - const float eps = 1e-5f; // TODO: make this a parameter + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -10335,6 +10861,8 @@ static void ggml_compute_forward_norm( } } +// ggml_compute_forward_group_rms_norm + static void ggml_compute_forward_rms_norm_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10399,7 +10927,6 @@ static void ggml_compute_forward_rms_norm( } } - static void ggml_compute_forward_rms_norm_back_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10418,7 +10945,8 @@ static void ggml_compute_forward_rms_norm_back_f32( 
GGML_TENSOR_BINARY_OP_LOCALS; - const float eps = 1e-6f; // TODO: make this a parameter + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -10573,21 +11101,111 @@ static void ggml_compute_forward_rms_norm_back( } } -// ggml_compute_forward_mul_mat +// ggml_compute_forward_group_norm -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas( - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; +static void ggml_compute_forward_group_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); - const int64_t ne10 = src1->ne[0]; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } - const int64_t ne0 = dst->ne[0]; + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS; + + const float eps = 1e-6f; // TODO: make this a parameter + + // TODO: optimize + + int n_channels = src0->ne[2]; + int n_groups = dst->op_params[0]; + int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; + for (int i = ith; i < n_groups; i+=nth) { + int start = i * n_channels_per_group; + int end = start + n_channels_per_group; + if (end > n_channels) { + end = n_channels; + } + int step = end - start; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + ggml_float sum = 0.0; + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_float)x[i00]; + } + } + } + float mean = sum / (ne00 * ne01 * step); + ggml_float sum2 = 0.0; + + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); + + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + + for (int64_t i00 = 0; i00 < ne00; i00++) { + float v = x[i00] - mean; + y[i00] = v; + sum2 += (ggml_float)(v * v); + } + } + } + float variance = sum2 / (ne00 * ne01 * step); + const float scale = 1.0f / sqrtf(variance + eps); + + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + ggml_vec_scale_f32(ne00, y, scale); + } + } + } + } +} + +static void ggml_compute_forward_group_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_group_norm_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_mul_mat + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +// helper function to determine if it is better to use BLAS or not +// for large matrices, BLAS is faster +static bool ggml_compute_forward_mul_mat_use_blas( + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + //const int64_t ne00 = 
src0->ne[0]; + //const int64_t ne01 = src0->ne[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; // TODO: find the optimal values for these @@ -10630,7 +11248,7 @@ static void ggml_compute_forward_mul_mat( GGML_ASSERT(ne3 == ne13); // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted @@ -10639,6 +11257,10 @@ static void ggml_compute_forward_mul_mat( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + // nb01 >= nb00 - src0 is not transposed // compute by src0 rows @@ -10653,11 +11275,6 @@ static void ggml_compute_forward_mul_mat( #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension - // ref: https://github.com/ggerganov/ggml/pull/224 - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - if (params->ith != 0) { return; } @@ -10670,12 +11287,16 @@ static void ggml_compute_forward_mul_mat( return; } - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const void * x = (char *) src0->data + i03*nb03 + i02*nb02; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + // broadcast src0 into src1 across 2nd,3rd dimension + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); if (type != GGML_TYPE_F32) { float * const wdata = params->wdata; @@ -10683,7 +11304,7 @@ static void ggml_compute_forward_mul_mat( size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { - to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); + to_float((const char *) x + i01*nb01, wdata + id, ne00); id += ne00; } @@ -10708,7 +11329,7 @@ static void ggml_compute_forward_mul_mat( if (params->type == GGML_TASK_INIT) { if (src1->type != vec_dot_type) { char * wdata = params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { @@ -10728,7 +11349,7 @@ static void ggml_compute_forward_mul_mat( } const void * wdata = (src1->type == vec_dot_type) ? 
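// NOTE: worked example for the r2/r3 broadcast factors above - assuming src0
// with ne[2]=1, ne[3]=1 and src1 with ne[2]=8, ne[3]=2: r2=8, r3=2, so
// i02 = i12/8 and i03 = i13/2 map every src1 slice onto the single src0
// matrix - precisely the case the removed GGML_ASSERT(ne02 == ne12) rejected.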
src1->data : params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = ne11*ne12*ne13; // src1 rows @@ -10763,10 +11384,6 @@ static void ggml_compute_forward_mul_mat( assert(ne12 % ne02 == 0); assert(ne13 % ne03 == 0); - // broadcast factors - const int64_t r2 = ne12/ne02; - const int64_t r3 = ne13/ne03; - // block-tiling attempt const int64_t blck_0 = 16; const int64_t blck_1 = 16; @@ -11201,7 +11818,7 @@ static void ggml_compute_forward_get_rows_q( assert( dst->ne[0] == nc); assert( dst->ne[1] == nr); - assert(src0->nb[0] == GGML_TYPE_SIZE[type]); + assert(src0->nb[0] == ggml_type_size(type)); for (int i = 0; i < nr; ++i) { const int r = ((int32_t *) src1->data)[i]; @@ -11502,8 +12119,8 @@ static void ggml_compute_forward_diag_mask_f32( const int ith = params->ith; const int nth = params->nth; - const int n_past = ((int32_t *) dst->op_params)[0]; - const bool inplace = (bool)((int32_t *) dst->op_params)[1]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const bool inplace = src0->data == dst->data; GGML_ASSERT(n_past >= 0); @@ -11714,6 +12331,7 @@ static void ggml_compute_forward_soft_max_back_f32( // dx = J * dy // dxk = sum_i(Jki * dyi) // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk // dxk = sum_i(-yk*yi * dyi) + yk*dyk // dxk = -yk * sum_i(yi * dyi) + yk*dyk // dxk = -yk * dot(y, dy) + yk*dyk @@ -11922,7 +12540,6 @@ static void ggml_compute_forward_alibi( } } - // ggml_compute_forward_clamp static void ggml_compute_forward_clamp_f32( @@ -12011,12 +12628,18 @@ static void ggml_compute_forward_rope_f32( float freq_base; float freq_scale; + // these two only relevant for xPos RoPE: + float xpos_base; + bool xpos_down; + const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); assert(n_past >= 0); @@ -12088,6 +12711,9 @@ static void ggml_compute_forward_rope_f32( for (int64_t i0 = 0; i0 < ne0; i0 += 2) { const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); + // zeta scaling for xPos only: + float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; theta *= theta_scale; @@ -12097,11 +12723,11 @@ static void ggml_compute_forward_rope_f32( const float x0 = src[0]; const float x1 = src[1]; - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[1] = x0*sin_theta + x1*cos_theta; + dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta; + dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; } } else { - // TODO: this is probably wrong, but I can't figure it out .. 
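// NOTE: hedged reference for the zeta term above (xPos; Sun et al., "A
// Length-Extrapolatable Transformer", arXiv:2212.10554):
//   zeta_i = ((i + 0.4*ne0) / (1.4*ne0)) ^ ((n_past + i2) / xpos_base)
// queries use zeta and keys its reciprocal (xpos_down), so their dot product
// acquires a decay that grows with relative distance.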
+ // TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { @@ -12230,7 +12856,7 @@ static void ggml_compute_forward_rope_f16( dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } else { - // TODO: this is probably wrong, but I can't figure it out .. + // TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { @@ -12292,9 +12918,21 @@ static void ggml_compute_forward_rope_back_f32( // dx = rope_back(dy, src1) // src0 is dy, src1 contains options + float freq_base; + float freq_scale; + + // these two only relevant for xPos RoPE: + float xpos_base; + bool xpos_down; + const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx); + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); assert(n_past >= 0); @@ -12320,7 +12958,7 @@ static void ggml_compute_forward_rope_back_f32( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f/n_dims); const bool is_neox = mode & 2; @@ -12331,12 +12969,15 @@ static void ggml_compute_forward_rope_back_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); + // zeta scaling for xPos only: + float zeta = xpos_base != 0.0f ? 
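// NOTE: rope_back now recovers freq_base/freq_scale (and the xPos parameters)
// from op_params instead of hard-coding 10000.0, so backward rotations use the
// same angles as forward:
//   theta_i(p) = freq_scale * p * freq_base^(-2i/n_dims)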
powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; theta *= theta_scale; @@ -12346,8 +12987,8 @@ static void ggml_compute_forward_rope_back_f32( const float dy0 = dy[0]; const float dy1 = dy[1]; - dx[0] = dy0*cos_theta + dy1*sin_theta; - dx[1] = - dy0*sin_theta + dy1*cos_theta; + dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta; + dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta; } } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { @@ -13040,6 +13681,106 @@ static void ggml_compute_forward_conv_2d( } } +// ggml_compute_forward_conv_transpose_2d + +static void ggml_compute_forward_conv_transpose_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00*ne01*ne02*ne03; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); + + // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02); + ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03; + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00]; + } + } + } + } + } + + // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; + for (int i12 = 0; i12 < ne12; i12++) { + for (int i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); + ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; + for (int i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]); + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + const int32_t stride = ggml_get_op_params_i32(dst, 0); + + // total patches in dst + const int np = ne2; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata_src = wdata + nk; + + for (int i2 = ip0; i2 < ip1; i2++) { // Cout + float * dst_data = (float *)((char *) dst->data + i2*nb2); + ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; + for (int i11 = 0; i11 < ne11; i11++) { + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i11*ne10*ne12 + i10*ne12; + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f16(ne03, &v, + wdata_src + i1n, + wdata_kernel + i01*ne00*ne03 + i00*ne03); + dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; + } + } + } + } + } +} + // ggml_compute_forward_pool_1d_sk_p0 static void ggml_compute_forward_pool_1d_sk_p0( @@ -13198,6 +13939,60 @@ static void ggml_compute_forward_pool_2d( 
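// NOTE: shape sketch for conv_transpose_2d above, under its asserted types
// (f16 kernel Kw x Kh x Cout x Cin, f32 source Sw x Sh x Cin): after both
// operands are repacked channels-innermost in wdata, each source pixel
// (i10, i11) scatter-adds a Kw x Kh patch into dst at
// (i10*stride + i00, i11*stride + i01), every patch value being a Cin-long
// ggml_vec_dot_f16 of a source row against a kernel row.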
ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst); } +// ggml_compute_forward_upscale + +static void ggml_compute_forward_upscale_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + + GGML_TENSOR_UNARY_OP_LOCALS; + + const int scale_factor = dst->op_params[0]; + + // TODO: optimize + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = ith; i02 < ne02; i02++) { + for (int m = 0; m < dst->ne[1]; m++) { + int i01 = m / scale_factor; + for (int n = 0; n < dst->ne[0]; n++) { + int i00 = n / scale_factor; + + const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03); + + float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]); + + *y = *x; + } + } + } + } +} + +static void ggml_compute_forward_upscale( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_upscale_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} // ggml_compute_forward_flash_attn @@ -13327,7 +14122,7 @@ static void ggml_compute_forward_flash_attn_f32( vvexpf(S, S, &Mup); ggml_vec_sum_f32(Mup, &sum, S); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { @@ -13337,9 +14132,13 @@ static void ggml_compute_forward_flash_attn_f32( if (SS[j] == -INFINITY) { SS[j] = 0.0f; } else { +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SS[j] - max); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif sump[j] += (ggml_float)val; SS[j] = val; } @@ -13917,7 +14716,7 @@ static void ggml_compute_forward_flash_attn_back_f32( vvexpf(SM, SM, &Mup); ggml_vec_sum_f32(Mup, &sum, SM); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { @@ -13928,9 +14727,13 @@ static void ggml_compute_forward_flash_attn_back_f32( if (SR[j] == -INFINITY) { SW[j] = 0.0f; } else { +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SR[j] - max); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif sump[j] += (ggml_float)val; SW[j] = val; } @@ -14323,42 +15126,43 @@ static void ggml_compute_forward_unary( } } -// ggml_compute_forward_map_unary +// ggml_compute_forward_get_rel_pos -static void ggml_compute_forward_map_unary_f32( +static void ggml_compute_forward_get_rel_pos_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - struct ggml_tensor * dst, - const ggml_unary_op_f32_t fun) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - + struct ggml_tensor * dst) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + // ref: 
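// NOTE: the #ifndef blocks above add GGML_FLASH_ATTN_EXP_FP16 (and, further
// down, GGML_CROSS_ENTROPY_EXP_FP16): unless defined, the softmax uses plain
// expf(SS[j] - max) instead of the fp16 lookup table table_exp_f16, trading
// some speed for full f32 precision; the old LUT path stays behind the define.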
https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); + GGML_TENSOR_UNARY_OP_LOCALS; - for (int i = 0; i < n; i++) { - fun(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); + const int64_t w = ne1; + + ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data; + ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data; + + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + const int64_t pos = (w - i1 - 1) + i2; + for (int64_t i0 = 0; i0 < ne0; ++i0) { + dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0]; + } + } } } - -static void ggml_compute_forward_map_unary( +static void ggml_compute_forward_get_rel_pos( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - struct ggml_tensor * dst, - const ggml_unary_op_f32_t fun) { + struct ggml_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_TYPE_F16: { - ggml_compute_forward_map_unary_f32(params, src0, dst, fun); + ggml_compute_forward_get_rel_pos_f16(params, src0, dst); } break; default: { @@ -14367,38 +15171,168 @@ static void ggml_compute_forward_map_unary( } } -// ggml_compute_forward_map_binary +// ggml_compute_forward_add_rel_pos -static void ggml_compute_forward_map_binary_f32( +static void ggml_compute_forward_add_rel_pos_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst, - const ggml_binary_op_f32_t fun) { - assert(params->ith == 0); - assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; + if (!inplace && params->type == GGML_TASK_INIT) { + memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); + return; + } if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - assert(src1->nb[0] == sizeof(float)); + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359 - for (int i = 0; i < n; i++) { - fun(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1])), - (float *) ((char *) src1->data + i*(src1->nb[1]))); - } -} + float * src1_data = (float *) src1->data; + float * src2_data = (float *) src2->data; + float * dst_data = (float *) dst->data; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; -static void ggml_compute_forward_map_binary( + const int ith = params->ith; + const int nth = params->nth; + + // total patches in dst + const int np = ne13; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + + for (int64_t i13 = ip0; i13 < ip1; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10; + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t jp0 = jp1 + i10; + 
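// NOTE: hedged reading of the scatter below, following the SAM reference above:
// viewing each dst row group as an (ne10 x ne10) attention tile, jdh = jp0*ne10
// broadcasts src2_e across the innermost axis while jdw = jp1*ne10 + i10
// broadcasts src1_e down the other - in effect
// attn += rel[..., :, None] + rel[..., None, :], one operand per axis, as in
// the referenced image_encoder.py lines.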
const float src1_e = src1_data[jp0]; + const float src2_e = src2_data[jp0]; + + const int64_t jdh = jp0 * ne10; + const int64_t jdw = jdh - (ne10 - 1) * i10; + + for (int64_t j = 0; j < ne10; ++j) { + dst_data[jdh + j ] += src2_e; + dst_data[jdw + j*ne10] += src1_e; + } + } + } + } + } +} + +static void ggml_compute_forward_add_rel_pos( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_map_unary + +static void ggml_compute_forward_map_unary_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, + const ggml_unary_op_f32_t fun) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + + +static void ggml_compute_forward_map_unary( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, + const ggml_unary_op_f32_t fun) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_map_unary_f32(params, src0, dst, fun); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_map_binary + +static void ggml_compute_forward_map_binary_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + const ggml_binary_op_f32_t fun) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + assert(src1->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1])), + (float *) ((char *) src1->data + i*(src1->nb[1]))); + } +} + + +static void ggml_compute_forward_map_binary( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14537,6 +15471,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32( const int nc = src0->ne[0]; const int nr = ggml_nrows(src0); + GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); + if (params->type == GGML_TASK_INIT) { if (ith == 0) { memset(sums, 0, sizeof(float) * (nth + nth * nc)); @@ -14548,7 +15484,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( if (ith == 0) { float * dp = (float *) dst->data; ggml_vec_sum_f32(nth, dp, sums); - dp[0] *= -1.0f; + dp[0] *= -1.0f / (float) nr; } return; } @@ -14565,7 +15501,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( for (int i1 = ir0; i1 < ir1; i1++) { float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); float * s1 = (float *)((char *) src1->data + 
i1*src1->nb[1]); - float * st = (float *) params->wdata + nth + ith*nc; + float * st = ((float *) params->wdata) + nth + ith*nc; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -14580,15 +15516,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32( float max = -INFINITY; ggml_vec_max_f32(nc, &max, s0); - uint16_t scvt; + uint16_t scvt; UNUSED(scvt); for (int i = 0; i < nc; i++) { if (s0[i] == -INFINITY) { st[i] = 0.0f; } else { - // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max); +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif sum += (ggml_float)val; st[i] = val; } @@ -14604,7 +15544,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32( ggml_vec_log_f32(nc, st, st); ggml_vec_mul_f32(nc, st, st, s1); - ggml_vec_sum_f32(nc, sums + ith, st); + float st_sum = 0; + ggml_vec_sum_f32(nc, &st_sum, st); + sums[ith] += st_sum; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -14654,7 +15596,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( return; } - const float eps = 1e-9f; + const double eps = 1e-9; // TODO: handle transposed/permuted matrices const int64_t nc = src0->ne[0]; @@ -14673,7 +15615,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); - float * sm = (float *) params->wdata + ith*nc; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -14682,54 +15623,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( assert(!isnan(s1[i])); } #endif - // step by step explanation: - { - //float * sums = (float *) params->wdata; - - // forward pass with annotated gradients from backward pass - // (built by going in reverse operation order, adding to gradients of current operation args) - // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum - // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) - // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps) - // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3] - // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3 - // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1 - // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]] - // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel] - - // substitute into grad[st1], because we can reuse softmax_back from this point on - // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps)) - // postorder: - // grad[st1] := softmax(s0) - // grad[st1] := grad[st1]*(1.0 - eps) - // grad[st1] := grad[st1] + eps - // grad[st1] := s1 / grad[st1] - // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel] - - // src0 gradients by going through softmax_back - // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) - // from softmax_back: - // dxk = yk * (dyk - dot(y, dy)) - // dot_y_dy := dot(y, dy) - // dx := dy - // dx := dx - dot_y_dy - // dx := dx * y - // postorder: - // dot_st1_dst1 := dot(st1, grad[st1]) - // grad[s0] := grad[st1] - // grad[s0] := grad[s0] - dot_st1_dst1 - // grad[s0] := grad[s0] * st1 - - 
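// NOTE: with dp[0] *= -1.0f / (float) nr above, the op now returns the mean
// cross entropy over the nr rows rather than the sum, and per-thread partials
// accumulate into sums[ith] via st_sum instead of overwriting it. Hedged
// closed form, matching the eps clamping used here:
//   loss = -(1/nr) * sum_i sum_k s1[i,k] * log(eps + (1-eps)*softmax(s0[i])[k])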
// prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1] - // sm := softmax(s0) - // grad[s0] := sm*(1.0 - eps) - // grad[s0] := grad[s0] + eps - // grad[s0] := s1 / grad[s0] - // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel] - // dot_st1_dst1 := dot(sm, grad[s0]) - // grad[s0] := grad[s0] - dot_st1_dst1 - // grad[s0] := grad[s0] * sm - } // soft_max ggml_float sum = 0.0; @@ -14737,39 +15630,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( float max = -INFINITY; ggml_vec_max_f32(nc, &max, s0); - uint16_t scvt; + uint16_t scvt; UNUSED(scvt); for (int i = 0; i < nc; i++) { if (s0[i] == -INFINITY) { - sm[i] = 0.0f; + ds0[i] = 0.0f; } else { - // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max); +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif sum += (ggml_float)val; - sm[i] = val; + ds0[i] = val; } } assert(sum > 0.0); - sum = 1.0/sum; + sum = (1.0 - eps)/sum; } - float dot_st1_dst1 = 0; - ggml_vec_scale_f32(nc, sm, sum); - ggml_vec_cpy_f32 (nc, ds0, sm); - ggml_vec_scale_f32(nc, ds0, (1.0f - eps)); - ggml_vec_add1_f32 (nc, ds0, ds0, eps); - ggml_vec_div_f32 (nc, ds0, s1, ds0); - ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]); - ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0); - ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1); - ggml_vec_mul_f32 (nc, ds0, ds0, sm); + // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr + ggml_vec_scale_f32(nc, ds0, sum); + ggml_vec_add1_f32(nc, ds0, ds0, eps); + ggml_vec_sub_f32(nc, ds0, ds0, s1); + ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); + #ifndef NDEBUG for (int i = 0; i < nc; ++i) { - assert(!isnan(sm[i])); - assert(!isinf(sm[i])); assert(!isnan(ds0[i])); assert(!isinf(ds0[i])); } @@ -14875,6 +15766,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_repeat_back(params, tensor->src[0], tensor); } break; + case GGML_OP_CONCAT: + { + ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor); + } break; case GGML_OP_SILU_BACK: { ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); @@ -14891,6 +15786,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); } break; + case GGML_OP_GROUP_NORM: + { + ggml_compute_forward_group_norm(params, tensor->src[0], tensor); + } break; case GGML_OP_MUL_MAT: { ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); @@ -14983,6 +15882,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); + } break; case GGML_OP_POOL_1D: { ggml_compute_forward_pool_1d(params, tensor->src[0], tensor); @@ -14991,6 +15894,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pool_2d(params, tensor->src[0], tensor); } break; + case GGML_OP_UPSCALE: + { + ggml_compute_forward_upscale(params, tensor->src[0], tensor); + } break; case GGML_OP_FLASH_ATTN: { const int32_t t = 
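// NOTE: the rewritten backward above replaces the deleted step-by-step
// softmax_back derivation with the closed form stated in the code:
//   grad(src0) = (softmax(src0) - src1) * grad_out / nr
// computed in place as ds0 = exp(s0 - max), scaled by (1-eps)/sum, +eps, -s1,
// then * d[0]/nr; the per-row sm scratch buffer in wdata is no longer needed.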
ggml_get_op_params_i32(tensor, 0); @@ -15021,6 +15928,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_unary(params, tensor->src[0], tensor); } break; + case GGML_OP_GET_REL_POS: + { + ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor); + } break; + case GGML_OP_ADD_REL_POS: + { + ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } break; case GGML_OP_MAP_UNARY: { ggml_unary_op_f32_t fun; @@ -15284,6 +16199,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor inplace); } } break; + case GGML_OP_CONCAT: + { + GGML_ASSERT(false); // TODO: implement + } break; case GGML_OP_SILU_BACK: { GGML_ASSERT(false); // TODO: not implemented @@ -15296,9 +16215,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { + float eps; + memcpy(&eps, tensor->op_params, sizeof(float)); + src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_rms_norm_back(ctx, src0, tensor->grad), + ggml_rms_norm_back(ctx, src0, tensor->grad, eps), inplace); } } break; @@ -15306,6 +16228,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_GROUP_NORM: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_MUL_MAT: { // https://cs231n.github.io/optimization-2/#staged @@ -15580,6 +16506,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; const int n_ctx = ((int32_t *) tensor->op_params)[3]; + float freq_base; + float freq_scale; + float xpos_base; + bool xpos_down; + memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + src0->grad = ggml_add_impl(ctx, src0->grad, ggml_rope_back(ctx, @@ -15587,7 +16522,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor n_past, n_dims, mode, - n_ctx), + n_ctx, + freq_base, + freq_scale, + xpos_base, + xpos_down), inplace); } } break; @@ -15598,14 +16537,28 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; const int n_ctx = ((int32_t *) tensor->op_params)[3]; + float freq_base; + float freq_scale; + float xpos_base; + bool xpos_down; + memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_rope(ctx, + ggml_rope_impl(ctx, tensor->grad, n_past, n_dims, mode, - n_ctx), + n_ctx, + freq_base, + freq_scale, + xpos_base, + xpos_down, + false), inplace); } } break; @@ -15625,6 +16578,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_POOL_1D: { 
GGML_ASSERT(false); // TODO: not implemented @@ -15633,6 +16590,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_UPSCALE: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_FLASH_ATTN: { struct ggml_tensor * flash_grad = NULL; @@ -15874,6 +16835,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor GGML_ASSERT(false); } } break; + case GGML_OP_GET_REL_POS: + case GGML_OP_ADD_REL_POS: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: case GGML_OP_MAP_CUSTOM1_F32: @@ -16025,9 +16988,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { return result; } -struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { - struct ggml_cgraph result = *gf; - +void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) { GGML_ASSERT(gf->n_nodes > 0); // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph @@ -16051,15 +17012,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg } } - for (int i = gf->n_nodes - 1; i >= 0; i--) { + for (int i = 0; i < gf->n_nodes; i++) { struct ggml_tensor * node = gf->nodes[i]; if (node->is_param) { GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); - ggml_build_forward_expand(&result, node->grad); + ggml_build_forward_expand(gb, node->grad); } } +} +struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { + struct ggml_cgraph result = *gf; + ggml_build_backward_expand(ctx, gf, &result, keep); return result; } @@ -16382,7 +17347,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; + cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -16395,7 +17360,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; if (ggml_is_quantized(node->src[0]->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks; + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -16407,7 +17372,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; if (ggml_is_quantized(node->src[0]->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks; + cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -16454,9 +17419,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: + case GGML_OP_GROUP_NORM: { n_tasks = n_threads; } break; + case GGML_OP_CONCAT: case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: { @@ -16490,12 +17457,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { // the threads are still spinning if (node->src[0]->type != GGML_TYPE_F32) { // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]); + cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); } } else #endif if (node->src[1]->type != vec_dot_type) { - 
cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type]; + cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type); } else { cur = 0; } @@ -16524,6 +17491,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: + case GGML_OP_ADD_REL_POS: { n_tasks = n_threads; } break; @@ -16598,6 +17566,25 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { GGML_ASSERT(false); } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + n_tasks = n_threads; + + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // Channels Out + const int64_t ne03 = node->src[0]->ne[3]; // Channels In + + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // Channels In + + size_t cur = 0; + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03; + cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12; + work_size = MAX(work_size, cur); } break; case GGML_OP_POOL_1D: @@ -16605,6 +17592,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { { n_tasks = 1; } break; + case GGML_OP_UPSCALE: + { + n_tasks = n_threads; + } break; case GGML_OP_FLASH_ATTN: { n_tasks = n_threads; @@ -16666,6 +17657,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } break; case GGML_OP_WIN_PART: case GGML_OP_WIN_UNPART: + case GGML_OP_GET_REL_POS: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: case GGML_OP_MAP_CUSTOM1_F32: @@ -16712,10 +17704,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks; - - work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { @@ -16783,8 +17771,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); GGML_ASSERT(rc == 0); + UNUSED(rc); } } + workers[0].ith = 0; workers[0].shared = &state_shared; @@ -16900,7 +17890,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { // compute size of intermediate results // TODO: does not take into account scratch buffers !!!! 
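// NOTE: ggml_nbytes_pad below is assumed (from the public header) to be
// ggml_nbytes() rounded up to GGML_MEM_ALIGN, roughly:
//   size_t ggml_nbytes_pad(const struct ggml_tensor * t) {
//       return GGML_PAD(ggml_nbytes(t), GGML_MEM_ALIGN);
//   }
// so the exported eval size now reserves an aligned slot per intermediate.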
for (int i = 0; i < cgraph->n_nodes; ++i) { - size_eval += ggml_nbytes(cgraph->nodes[i]); + size_eval += ggml_nbytes_pad(cgraph->nodes[i]); } // print @@ -17358,10 +18348,11 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * node = cgraph->leafs[i]; - GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n", + GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n", i, node->ne[0], node->ne[1], - ggml_op_name(node->op)); + ggml_op_name(node->op), + ggml_get_name(node)); } for (int i = 0; i < GGML_OP_COUNT; i++) { @@ -17591,14 +18582,16 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { GGML_ASSERT(ggml_is_scalar(f)); // these will store the parameters we want to optimize struct ggml_tensor * ps[GGML_MAX_PARAMS]; int np = 0; - int nx = 0; + int64_t nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { if (gf->nodes[i]->is_param) { GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); @@ -17617,31 +18610,32 @@ static enum ggml_opt_result ggml_opt_adam( } // constants - const float sched = params.adam.sched; - const float decay = params.adam.decay * sched; - const float alpha = params.adam.alpha * sched; + float sched = params.adam.sched; + const float alpha = params.adam.alpha; + const float decay = params.adam.decay * alpha; const float beta1 = params.adam.beta1; const float beta2 = params.adam.beta2; const float eps = params.adam.eps; + const float gclip = params.adam.gclip; + const int decay_min_ndim = params.adam.decay_min_ndim; - float * x = opt->adam.x->data; // view of the parameters - float * g1 = opt->adam.g1->data; // gradient - float * g2 = opt->adam.g2->data; // gradient squared float * m = opt->adam.m->data; // first moment float * v = opt->adam.v->data; // second moment - float * mh = opt->adam.mh->data; // first moment hat - float * vh = opt->adam.vh->data; // second moment hat float * pf = params.past > 0 ? 
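// NOTE: ggml_opt_adam (and ggml_opt_lbfgs below) now accept a ggml_opt_callback
// plus opaque callback_data; the hook fires before each evaluation with a
// mutable &sched, letting callers implement learning-rate schedules or swap in
// the next training batch. L-BFGS also calls it but passes sched = 0, since it
// has no learning rate to schedule.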
opt->adam.pf->data : NULL; // past function values - // update view - ggml_opt_get_params(np, ps, x); + if (callback) { + callback(callback_data, &sched); + } // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + ggml_graph_compute(gb, &cplan); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17649,6 +18643,9 @@ static enum ggml_opt_result ggml_opt_adam( pf[opt->iter % params.past] = opt->adam.fx_prev; } + opt->loss_before = opt->adam.fx_prev; + opt->loss_after = opt->adam.fx_prev; + // initialize if (opt->just_initialized) { opt->adam.n_no_improvement = 0; @@ -17681,50 +18678,55 @@ static enum ggml_opt_result ggml_opt_adam( UNUSED(t_start_cpu); { - // update the gradient - ggml_opt_get_grad(np, ps, g1); - - // m_t = beta1*m_t-1 + (1 - beta1)*g_t - ggml_vec_scale_f32(nx, m, beta1); - ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1); - - // g2 = g1^2 - ggml_vec_sqr_f32 (nx, g2, g1); - - // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2 - ggml_vec_scale_f32(nx, v, beta2); - ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2); - - // m^hat = m_t / (1 - beta1^t) - // v^hat = v_t / (1 - beta2^t) - // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1) - // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1 - // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps) - // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps) - // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay) - ggml_vec_cpy_f32 (nx, mh, m); - ggml_vec_cpy_f32 (nx, vh, v); - - ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter))); - ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter))); - - ggml_vec_sqrt_f32 (nx, vh, vh); - ggml_vec_acc1_f32 (nx, vh, eps); - - ggml_vec_div_f32 (nx, mh, mh, vh); - ggml_vec_scale_f32(nx, x, 1.0f - decay); - ggml_vec_sub_f32 (nx, x, x, mh); + float gnorm = 1.0f; + if (gclip > 0.0f) { + // gradient clipping + ggml_float sum = 0.0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]); + for (int64_t j = 0; j < ne; ++j) { + float g = ggml_get_f32_1d(ps[p]->grad, j); + sum += (ggml_float)(g*g); + } + } + ggml_float norm = sqrt(sum); + if (norm > (ggml_float) gclip) { + gnorm = (float) ((ggml_float) gclip / norm); + } + } + const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]); + const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? 
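// NOTE: hedged summary of the fused update in the loop here (AdamW with
// optional global-norm clipping), with t = opt->iter:
//   g <- g * min(1, gclip/||g||2)                              (gnorm)
//   m <- beta1*m + (1-beta1)*g;   v <- beta2*v + (1-beta2)*g*g
//   x <- x*(1 - p_decay) - (alpha*sched/(1-beta1^t)) * m / (sqrt(v/(1-beta2^t)) + eps)
// weight decay applies only to tensors with n_dims >= decay_min_ndim, so biases
// and 1-D norm weights are exempt by default.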
decay : 0.0f) * sched; + for (int64_t j = 0; j < ne; ++j) { + float x = ggml_get_f32_1d(ps[p], j); + float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm; + m[i] = m[i]*beta1 + g*(1.0f - beta1); + v[i] = v[i]*beta2 + g*g*(1.0f - beta2); + float mh = m[i]*beta1h; + float vh = v[i]*beta2h; + vh = sqrtf(vh) + eps; + x = x*(1.0f - p_decay) - mh/vh; + ggml_set_f32_1d(ps[p], j, x); + ++i; + } + } + } - // update the parameters - ggml_opt_set_params(np, ps, x); + if (callback) { + callback(callback_data, &sched); } ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); + ggml_graph_compute(gb, &cplan); const float fx = ggml_get_f32_1d(f, 0); + opt->loss_after = fx; + // check convergence if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { @@ -17793,7 +18795,6 @@ struct ggml_lbfgs_iteration_data { }; static enum ggml_opt_result linesearch_backtracking( - struct ggml_context * ctx, const struct ggml_opt_params * params, int nx, float * x, @@ -17805,8 +18806,11 @@ static enum ggml_opt_result linesearch_backtracking( struct ggml_tensor * f, struct ggml_cgraph * gf, struct ggml_cgraph * gb, + struct ggml_cplan * cplan, const int np, - struct ggml_tensor * ps[]) { + struct ggml_tensor * ps[], + ggml_opt_callback callback, + void * callback_data) { int count = 0; float width = 0.0f; @@ -17835,6 +18839,12 @@ static enum ggml_opt_result linesearch_backtracking( dgtest = params->lbfgs.ftol*dginit; while (true) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, &sched); + } + ggml_vec_cpy_f32(nx, x, xp); ggml_vec_mad_f32(nx, x, d, *step); @@ -17845,7 +18855,7 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params->n_threads); + ggml_graph_compute(gb, cplan); ggml_opt_get_grad(np, ps, g); @@ -17879,7 +18889,6 @@ static enum ggml_opt_result linesearch_backtracking( // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) return count; } - return count; } } @@ -17905,7 +18914,9 @@ static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { @@ -17937,6 +18948,10 @@ static enum ggml_opt_result ggml_opt_lbfgs( opt->iter = iter; } + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + float * x = opt->lbfgs.x->data; // current parameters float * xp = opt->lbfgs.xp->data; // previous parameters float * g = opt->lbfgs.g->data; // current gradient @@ -17958,6 +18973,12 @@ static enum ggml_opt_result ggml_opt_lbfgs( float * lm_s = opt->lbfgs.lms->data; float * lm_y = opt->lbfgs.lmy->data; + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, &sched); + } + // evaluate the function value and its gradient { ggml_opt_set_params(np, ps, x); @@ -17965,11 +18986,14 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 
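// NOTE: both optimizers drop ggml_graph_compute_with_ctx in favor of a plan
// built once and reused every iteration:
//   struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
//   struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
//   cplan.work_data = (uint8_t *) ctx->mem_buffer + obj->offs;
//   ggml_graph_compute(gb, &cplan);
// which keeps work-buffer allocation out of the hot training loop.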
(f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); + ggml_graph_compute(gb, &cplan); ggml_opt_get_grad(np, ps, g); fx = ggml_get_f32_1d(f, 0); + + opt->loss_before = fx; + opt->loss_after = fx; } // search direction = -gradient @@ -18024,7 +19048,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data); if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -18034,6 +19058,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( return ls; } + opt->loss_after = fx; + ggml_vec_norm_f32(nx, &xnorm, x); ggml_vec_norm_f32(nx, &gnorm, g); @@ -18091,7 +19117,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // ys = y^t \cdot s -> 1 / \rho. // yy = y^t \cdot y. // - ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]); + ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]); ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); lm_ys[end[0]] = ys; @@ -18154,13 +19180,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .adam = { .n_iter = 10000, .sched = 1.000f, - .decay = 0.001f, + .decay = 0.0f, + .decay_min_ndim = 2, .alpha = 0.001f, .beta1 = 0.9f, .beta2 = 0.999f, .eps = 1e-8f, .eps_f = 1e-5f, .eps_g = 1e-3f, + .gclip = 0.0f, }, }; } break; @@ -18210,23 +19238,13 @@ GGML_API void ggml_opt_init( switch (opt->params.type) { case GGML_OPT_ADAM: { - opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) : NULL; - ggml_set_zero(opt->adam.x); - ggml_set_zero(opt->adam.g1); - ggml_set_zero(opt->adam.g2); ggml_set_zero(opt->adam.m); ggml_set_zero(opt->adam.v); - ggml_set_zero(opt->adam.mh); - ggml_set_zero(opt->adam.vh); if (opt->adam.pf) { ggml_set_zero(opt->adam.pf); } @@ -18301,8 +19319,8 @@ enum ggml_opt_result ggml_opt_resume( struct ggml_tensor * f) { // build forward + backward compute graphs - struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0)); - struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0)); + struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 
1 : 0)); struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; @@ -18310,7 +19328,7 @@ enum ggml_opt_result ggml_opt_resume( *gf = ggml_build_forward (f); *gb = ggml_build_backward(ctx, gf, true); - return ggml_opt_resume_g(ctx, opt, f, gf, gb); + return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); } enum ggml_opt_result ggml_opt_resume_g( @@ -18318,7 +19336,9 @@ enum ggml_opt_result ggml_opt_resume_g( struct ggml_opt_context * opt, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { // build forward + backward compute graphs enum ggml_opt_result result = GGML_OPT_OK; @@ -18326,11 +19346,11 @@ enum ggml_opt_result ggml_opt_resume_g( switch (opt->params.type) { case GGML_OPT_ADAM: { - result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb); + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; case GGML_OPT_LBFGS: { - result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb); + result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; } @@ -18561,47 +19581,1147 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i //////////////////////////////////////////////////////////////////////////////// -int ggml_cpu_has_avx(void) { -#if defined(__AVX__) - return 1; -#else - return 0; -#endif -} +struct gguf_str { + uint64_t n; // GGUFv2 + char * data; +}; -int ggml_cpu_has_avx2(void) { -#if defined(__AVX2__) - return 1; -#else - return 0; -#endif -} +static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { + [GGUF_TYPE_UINT8] = sizeof(uint8_t), + [GGUF_TYPE_INT8] = sizeof(int8_t), + [GGUF_TYPE_UINT16] = sizeof(uint16_t), + [GGUF_TYPE_INT16] = sizeof(int16_t), + [GGUF_TYPE_UINT32] = sizeof(uint32_t), + [GGUF_TYPE_INT32] = sizeof(int32_t), + [GGUF_TYPE_FLOAT32] = sizeof(float), + [GGUF_TYPE_BOOL] = sizeof(bool), + [GGUF_TYPE_STRING] = sizeof(struct gguf_str), + [GGUF_TYPE_UINT64] = sizeof(uint64_t), + [GGUF_TYPE_INT64] = sizeof(int64_t), + [GGUF_TYPE_FLOAT64] = sizeof(double), + [GGUF_TYPE_ARRAY] = 0, // undefined +}; +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); + +static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { + [GGUF_TYPE_UINT8] = "u8", + [GGUF_TYPE_INT8] = "i8", + [GGUF_TYPE_UINT16] = "u16", + [GGUF_TYPE_INT16] = "i16", + [GGUF_TYPE_UINT32] = "u32", + [GGUF_TYPE_INT32] = "i32", + [GGUF_TYPE_FLOAT32] = "f32", + [GGUF_TYPE_BOOL] = "bool", + [GGUF_TYPE_STRING] = "str", + [GGUF_TYPE_ARRAY] = "arr", + [GGUF_TYPE_UINT64] = "u64", + [GGUF_TYPE_INT64] = "i64", + [GGUF_TYPE_FLOAT64] = "f64", +}; +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); + +union gguf_value { + uint8_t uint8; + int8_t int8; + uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + uint64_t uint64; + int64_t int64; + double float64; + bool bool_; + + struct gguf_str str; + + struct { + enum gguf_type type; + + uint64_t n; // GGUFv2 + void * data; + } arr; +}; -int ggml_cpu_has_avx512(void) { -#if defined(__AVX512F__) - return 1; -#else - return 0; -#endif -} +struct gguf_kv { + struct gguf_str key; -int ggml_cpu_has_avx512_vbmi(void) { -#if defined(__AVX512VBMI__) - return 1; -#else - return 0; -#endif -} + enum gguf_type type; + union gguf_value value; +}; -int ggml_cpu_has_avx512_vnni(void) { -#if defined(__AVX512VNNI__) - return 1; -#else - return 0; -#endif -} +struct 
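// NOTE: byte layout implied by gguf_header below and the reader that follows:
//   uint32 magic "GGUF" | uint32 version | uint64 n_tensors | uint64 n_kv
// GGUFv1 stored the two counts (and every string length) as uint32, which is
// why the v1 branches read 32-bit temporaries before widening to uint64.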
gguf_header { + uint32_t magic; + uint32_t version; + uint64_t n_tensors; // GGUFv2 + uint64_t n_kv; // GGUFv2 +}; -int ggml_cpu_has_fma(void) { +struct gguf_tensor_info { + struct gguf_str name; + + uint32_t n_dims; + uint64_t ne[GGML_MAX_DIMS]; + + enum ggml_type type; + + uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` + + // for writing API + const void * data; + size_t size; +}; + +struct gguf_context { + struct gguf_header header; + + struct gguf_kv * kv; + struct gguf_tensor_info * infos; + + size_t alignment; + size_t offset; // offset of `data` from beginning of file + size_t size; // size of `data` in bytes + + //uint8_t * padding; + void * data; +}; + +static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) { + const size_t n = fread(dst, 1, size, file); + *offset += n; + return n == size; +} + +// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 +static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); + ok = ok && gguf_fread_el(file, p->data, p->n, offset); + + return ok; +} + +static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + uint32_t n = 0; + ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n; + ok = ok && gguf_fread_el(file, p->data, p->n, offset); + + return ok; +} + +struct gguf_context * gguf_init_empty(void) { + struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + + ctx->header.magic = GGUF_MAGIC; + ctx->header.version = GGUF_VERSION; + ctx->header.n_tensors = 0; + ctx->header.n_kv = 0; + + ctx->kv = NULL; + ctx->infos = NULL; + + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; + ctx->offset = 0; + ctx->size = 0; + + ctx->data = NULL; + + return ctx; +} + +struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { + FILE * file = fopen(fname, "rb"); + if (!file) { + return NULL; + } + + // offset from start of file + size_t offset = 0; + + uint32_t magic = 0; + + // check the magic before making allocations + { + gguf_fread_el(file, &magic, sizeof(magic), &offset); + + if (magic != GGUF_MAGIC) { + fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic); + fclose(file); + return NULL; + } + } + + bool ok = true; + + struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + + // read the header + { + ctx->header.magic = magic; + + ctx->kv = NULL; + ctx->infos = NULL; + ctx->data = NULL; + + ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n_tensors = 0; + uint32_t n_kv = 0; + + ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset); + ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset); + + ctx->header.n_tensors = n_tensors; + ctx->header.n_kv = n_kv; + } else { + ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); + ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + } + + if (!ok) { + fprintf(stderr, "%s: failed to read header\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + + // NOTE: temporary handling of GGUFv1 >> remove after Oct 
2023 + bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur; + if (ctx->header.version == 1) { + gguf_fread_str = gguf_fread_str_v1; + } + + // read the kv pairs + { + ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); + + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + //fprintf(stderr, "%s: reading kv %d\n", __func__, i); + + ok = ok && gguf_fread_str(file, &kv->key, &offset); + ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset); + + //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); + + switch (kv->type) { + case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break; + case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break; + case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break; + case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break; + case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break; + case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break; + case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break; + case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break; + case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break; + case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break; + case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break; + case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break; + case GGUF_TYPE_ARRAY: + { + ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n = 0; + ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset); + kv->value.arr.n = n; + } else { + ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + } + + switch (kv->value.arr.type) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: + case GGUF_TYPE_BOOL: + { + kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); + ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset); + } break; + case GGUF_TYPE_STRING: + { + kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str)); + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); + } + } break; + case GGUF_TYPE_ARRAY: + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + }; + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + }; + + if (!ok) { + break; + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read key-value pairs\n", __func__); + fclose(file); + gguf_free(ctx); + 
return NULL; + } + } + + // read the tensor infos + { + ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + info->ne[j] = 1; + } + + ok = ok && gguf_fread_str(file, &info->name, &offset); + ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); + for (uint32_t j = 0; j < info->n_dims; ++j) { + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t t = 0; + ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset); + info->ne[j] = t; + } else { + ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); + } + } + ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); + ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor info\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + } + + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; + + int alignment_idx = gguf_find_key(ctx, "general.alignment"); + if (alignment_idx != -1) { + ctx->alignment = gguf_get_val_u32(ctx, alignment_idx); + } + + // we require the data section to be aligned, so take into account any padding + { + const size_t offset_pad = offset % ctx->alignment; + + if (offset_pad != 0) { + offset += ctx->alignment - offset_pad; + fseek(file, offset, SEEK_SET); + } + } + + // store the current file offset - this is where the data section starts + ctx->offset = offset; + + // compute the total size of the data section, taking into account the alignment + { + ctx->size = 0; + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + const int64_t ne = + (int64_t) info->ne[0] * + (int64_t) info->ne[1] * + (int64_t) info->ne[2] * + (int64_t) info->ne[3]; + + if (ne % ggml_blck_size(info->type) != 0) { + fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", + __func__, info->name.data, ne, ggml_blck_size(info->type)); + fclose(file); + gguf_free(ctx); + return NULL; + } + + const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); + + ctx->size += GGML_PAD(size_cur, ctx->alignment); + } + } + + // load the tensor data only if requested + if (params.ctx != NULL) { + // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob + // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of + // the ggml_tensor structs to the appropriate locations in the binary blob + + // compute the exact size needed for the new ggml_context + const size_t mem_size = + params.no_alloc ? 
+ (ctx->header.n_tensors )*ggml_tensor_overhead() : + (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size; + + struct ggml_init_params pdata = { + .mem_size = mem_size, + .mem_buffer = NULL, + .no_alloc = params.no_alloc, + }; + + *params.ctx = ggml_init(pdata); + + struct ggml_context * ctx_data = *params.ctx; + + struct ggml_tensor * data = NULL; + + if (!params.no_alloc) { + data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size); + + ok = ok && data != NULL; + + // read the binary blob with the tensor data + ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor data\n", __func__); + fclose(file); + ggml_free(ctx_data); + gguf_free(ctx); + return NULL; + } + + ctx->data = data->data; + } + + ggml_set_no_alloc(ctx_data, true); + + // create the tensors + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + const int64_t ne[GGML_MAX_DIMS] = { + ctx->infos[i].ne[0], + ctx->infos[i].ne[1], + ctx->infos[i].ne[2], + ctx->infos[i].ne[3], + }; + + struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); + + ok = ok && cur != NULL; + + ggml_set_name(cur, ctx->infos[i].name.data); + + if (!ok) { + break; + } + + // point the data member to the appropriate location in the binary blob using the tensor infos + if (!params.no_alloc) { + //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file + cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read the tensor data\n", __func__); + fclose(file); + ggml_free(ctx_data); + gguf_free(ctx); + return NULL; + } + + ggml_set_no_alloc(ctx_data, params.no_alloc); + } + + fclose(file); + + return ctx; +} + +void gguf_free(struct gguf_context * ctx) { + if (ctx == NULL) { + return; + } + + if (ctx->kv) { + // free string memory - not great.. 
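+        // every kv string and array payload was heap-allocated while reading (calloc for strings, malloc for array buffers) or by the set_* API (strdup), so each element must be freed individually before the kv array itself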
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + if (kv->key.data) { + free(kv->key.data); + } + + if (kv->type == GGUF_TYPE_STRING) { + if (kv->value.str.data) { + free(kv->value.str.data); + } + } + + if (kv->type == GGUF_TYPE_ARRAY) { + if (kv->value.arr.data) { + if (kv->value.arr.type == GGUF_TYPE_STRING) { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j]; + if (str->data) { + free(str->data); + } + } + } + free(kv->value.arr.data); + } + } + } + + free(ctx->kv); + } + + if (ctx->infos) { + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + if (info->name.data) { + free(info->name.data); + } + } + + free(ctx->infos); + } + + GGML_ALIGNED_FREE(ctx); +} + +const char * gguf_type_name(enum gguf_type type) { + return GGUF_TYPE_NAME[type]; +} + +int gguf_get_version(const struct gguf_context * ctx) { + return ctx->header.version; +} + +size_t gguf_get_alignment(const struct gguf_context * ctx) { + return ctx->alignment; +} + +size_t gguf_get_data_offset(const struct gguf_context * ctx) { + return ctx->offset; +} + +void * gguf_get_data(const struct gguf_context * ctx) { + return ctx->data; +} + +int gguf_get_n_kv(const struct gguf_context * ctx) { + return ctx->header.n_kv; +} + +int gguf_find_key(const struct gguf_context * ctx, const char * key) { + // return -1 if key not found + int keyfound = -1; + + const int n_kv = gguf_get_n_kv(ctx); + + for (int i = 0; i < n_kv; ++i) { + if (strcmp(key, gguf_get_key(ctx, i)) == 0) { + keyfound = i; + break; + } + } + + return keyfound; +} + +const char * gguf_get_key(const struct gguf_context * ctx, int i) { + return ctx->kv[i].key.data; +} + +enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) { + return ctx->kv[i].type; +} + +enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.arr.type; +} + +const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.arr.data; +} + +const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) { + struct gguf_kv * kv = &ctx->kv[key_id]; + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i]; + return str->data; +} + +int gguf_get_arr_n(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.arr.n; +} + +uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint8; +} + +int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int8; +} + +uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint16; +} + +int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int16; +} + +uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint32; +} + +int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int32; +} + +float gguf_get_val_f32(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.float32; +} + +uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint64; +} + +int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int64; +} + +double gguf_get_val_f64(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.float64; +} + +bool gguf_get_val_bool(const struct 
gguf_context * ctx, int i) { + return ctx->kv[i].value.bool_; +} + +const char * gguf_get_val_str (const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.str.data; +} + +int gguf_get_n_tensors(const struct gguf_context * ctx) { + return ctx->header.n_tensors; +} + +int gguf_find_tensor(const struct gguf_context * ctx, const char * name) { + // return -1 if tensor not found + int tensorfound = -1; + + const int n_tensors = gguf_get_n_tensors(ctx); + + for (int i = 0; i < n_tensors; ++i) { + if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) { + tensorfound = i; + break; + } + } + + return tensorfound; +} + +size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) { + return ctx->infos[i].offset; +} + +char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) { + return ctx->infos[i].name.data; +} + +// returns the index +static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { + const int idx = gguf_find_key(ctx, key); + if (idx >= 0) { + return idx; + } + + const int n_kv = gguf_get_n_kv(ctx); + + ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); + ctx->kv[n_kv].key.n = strlen(key); + ctx->kv[n_kv].key.data = strdup(key); + ctx->header.n_kv++; + + return n_kv; +} + +void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT8; + ctx->kv[idx].value.uint8 = val; +} + +void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT8; + ctx->kv[idx].value.int8 = val; +} + +void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT16; + ctx->kv[idx].value.uint16 = val; +} + +void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT16; + ctx->kv[idx].value.int16 = val; +} + +void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT32; + ctx->kv[idx].value.uint32 = val; +} + +void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT32; + ctx->kv[idx].value.int32 = val; +} + +void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_FLOAT32; + ctx->kv[idx].value.float32 = val; +} + +void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT64; + ctx->kv[idx].value.uint64 = val; +} + +void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT64; + ctx->kv[idx].value.int64 = val; +} + +void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_FLOAT64; + ctx->kv[idx].value.float64 = val; +} + +void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = 
GGUF_TYPE_BOOL; + ctx->kv[idx].value.bool_ = val; +} + +void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_STRING; + ctx->kv[idx].value.str.n = strlen(val); + ctx->kv[idx].value.str.data = strdup(val); +} + +void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_ARRAY; + ctx->kv[idx].value.arr.type = type; + ctx->kv[idx].value.arr.n = n; + ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]); + memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]); +} + +void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_ARRAY; + ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING; + ctx->kv[idx].value.arr.n = n; + ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str)); + for (int i = 0; i < n; i++) { + struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i]; + str->n = strlen(data[i]); + str->data = strdup(data[i]); + } +} + +// set or add KV pairs from another context +void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { + for (uint32_t i = 0; i < src->header.n_kv; i++) { + switch (src->kv[i].type) { + case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break; + case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break; + case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break; + case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break; + case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break; + case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break; + case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break; + case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break; + case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break; + case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break; + case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break; + case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break; + case GGUF_TYPE_ARRAY: + { + if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { + const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { + data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; + } + gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); + free(data); + } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) { + GGML_ASSERT(false && "nested arrays not supported"); + } else { + gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); + } + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + } + } +} + +void gguf_add_tensor( + struct gguf_context * ctx, + const struct ggml_tensor * tensor) { + const int idx = ctx->header.n_tensors; + ctx->infos = 
realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); + + ctx->infos[idx].name.n = strlen(tensor->name); + ctx->infos[idx].name.data = strdup(tensor->name); + + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + ctx->infos[idx].ne[i] = 1; + } + + ctx->infos[idx].n_dims = tensor->n_dims; + for (int i = 0; i < tensor->n_dims; i++) { + ctx->infos[idx].ne[i] = tensor->ne[i]; + } + + ctx->infos[idx].type = tensor->type; + ctx->infos[idx].offset = 0; + ctx->infos[idx].data = tensor->data; + ctx->infos[idx].size = ggml_nbytes(tensor); + + if (ctx->header.n_tensors > 0) { + ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment); + } + + ctx->header.n_tensors++; +} + +void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) { + const int idx = gguf_find_tensor(ctx, name); + if (idx < 0) { + GGML_ASSERT(false && "tensor not found"); + } + + ctx->infos[idx].type = type; +} + +void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) { + const int idx = gguf_find_tensor(ctx, name); + if (idx < 0) { + GGML_ASSERT(false && "tensor not found"); + } + + ctx->infos[idx].data = data; + ctx->infos[idx].size = size; + + // update offsets + for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) { + ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment); + } +} + +//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) { +// fwrite(&val->n, sizeof(val->n), 1, file); +// fwrite(val->data, sizeof(char), val->n, file); +//} +// +//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) { +// fwrite(val, sizeof(char), size, file); +//} + +struct gguf_buf { + void * data; + size_t size; + size_t offset; +}; + +static struct gguf_buf gguf_buf_init(size_t size) { + struct gguf_buf buf = { + /*buf.data =*/ size == 0 ? 
NULL : malloc(size), + /*buf.size =*/ size, + /*buf.offset =*/ 0, + }; + + return buf; +} + +static void gguf_buf_free(struct gguf_buf buf) { + if (buf.data) { + free(buf.data); + } +} + +static void gguf_buf_grow(struct gguf_buf * buf, size_t size) { + if (buf->offset + size > buf->size) { + buf->size = 1.5*(buf->offset + size); + if (buf->data) { + buf->data = realloc(buf->data, buf->size); + } + } +} + +static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) { + gguf_buf_grow(buf, sizeof(val->n) + val->n); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n)); + } + buf->offset += sizeof(val->n); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val->data, val->n); + } + buf->offset += val->n; +} + +static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) { + gguf_buf_grow(buf, el_size); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val, el_size); + } + buf->offset += el_size; +} + +static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) { + // write header + gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic)); + gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version)); + gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors)); + gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv)); + + // write key-value pairs + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + gguf_bwrite_str(buf, &kv->key); + gguf_bwrite_el (buf, &kv->type, sizeof(kv->type)); + + switch (kv->type) { + case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break; + case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break; + case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break; + case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break; + case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break; + case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break; + case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break; + case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break; + case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break; + case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break; + case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break; + case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break; + case GGUF_TYPE_ARRAY: + { + gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type)); + gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) ); + + switch (kv->value.arr.type) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: + case GGUF_TYPE_BOOL: + { + gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); + } break; + case GGUF_TYPE_STRING: + { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]); + } + } 
break; + case GGUF_TYPE_ARRAY: + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + }; + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + }; + } + + // write tensor infos + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + gguf_bwrite_str(buf, &info->name); + gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims)); + for (uint32_t j = 0; j < info->n_dims; ++j) { + gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j])); + } + gguf_bwrite_el(buf, &info->type, sizeof(info->type)); + gguf_bwrite_el(buf, &info->offset, sizeof(info->offset)); + } + + // we require the data section to be aligned, so take into account any padding + { + const size_t offset = buf->offset; + const size_t offset_pad = GGML_PAD(offset, ctx->alignment); + + if (offset_pad != offset) { + uint8_t pad = 0; + for (size_t i = 0; i < offset_pad - offset; ++i) { + gguf_bwrite_el(buf, &pad, sizeof(pad)); + } + } + } + + if (only_meta) { + return; + } + + size_t offset = 0; + + // write tensor data + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + const size_t size = info->size; + const size_t size_pad = GGML_PAD(size, ctx->alignment); + + gguf_bwrite_el(buf, info->data, size); + + if (size_pad != size) { + uint8_t pad = 0; + for (size_t j = 0; j < size_pad - size; ++j) { + gguf_bwrite_el(buf, &pad, sizeof(pad)); + } + } + + GGML_ASSERT(offset == info->offset); + + offset += size_pad; + } +} + +void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) { + FILE * file = fopen(fname, "wb"); + if (!file) { + GGML_ASSERT(false && "failed to open file for writing"); + } + + struct gguf_buf buf = gguf_buf_init(16*1024); + + gguf_write_to_buf(ctx, &buf, only_meta); + + fwrite(buf.data, 1, buf.offset, file); + + gguf_buf_free(buf); + + fclose(file); +} + +size_t gguf_get_meta_size(const struct gguf_context * ctx) { + // no allocs - only compute size + struct gguf_buf buf = gguf_buf_init(0); + + gguf_write_to_buf(ctx, &buf, true); + + return buf.offset; +} + +void gguf_get_meta_data(const struct gguf_context * ctx, void * data) { + struct gguf_buf buf = gguf_buf_init(16*1024); + + gguf_write_to_buf(ctx, &buf, true); + + memcpy(data, buf.data, buf.offset); + + gguf_buf_free(buf); +} + +//////////////////////////////////////////////////////////////////////////////// + +int ggml_cpu_has_avx(void) { +#if defined(__AVX__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx2(void) { +#if defined(__AVX2__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512(void) { +#if defined(__AVX512F__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512_vbmi(void) { +#if defined(__AVX512VBMI__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512_vnni(void) { +#if defined(__AVX512VNNI__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_fma(void) { #if defined(__FMA__) return 1; #else @@ -18625,6 +20745,14 @@ int ggml_cpu_has_arm_fma(void) { #endif } +int ggml_cpu_has_metal(void) { +#if defined(GGML_USE_METAL) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_f16c(void) { #if defined(__F16C__) return 1; @@ -18685,6 +20813,14 @@ int ggml_cpu_has_sse3(void) { #endif } +int ggml_cpu_has_ssse3(void) { +#if defined(__SSSE3__) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_vsx(void) { #if defined(__POWER9_VECTOR__) return 1; diff --git a/ggml.h b/ggml.h index 
bdbd12800433242dec1e6dac8c96875540dd19e7..f45456876da621d5beafdc2419d9d575a9fb02ff 100644 --- a/ggml.h +++ b/ggml.h @@ -130,13 +130,16 @@ // The data of the tensor is accessed via the "data" pointer. For example: // // { -// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); +// const int nx = 2; +// const int ny = 3; // -// // a[2, 1] = 1.0f; -// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; +// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny); // -// // a[0, 2] = 2.0f; -// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; +// for (int y = 0; y < ny; y++) { +// for (int x = 0; x < nx; x++) { +// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y; +// } +// } // // ... // } @@ -192,6 +195,14 @@ # define GGML_DEPRECATED(func, hint) func #endif +#ifndef __GNUC__ +# define GGML_ATTRIBUTE_FORMAT(...) +#elif defined(__MINGW32__) +# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif + #include <stdint.h> #include <stddef.h> #include <stdbool.h> @@ -207,14 +218,24 @@ #define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 -#define GGML_MAX_NAME 48 +#define GGML_MAX_NAME 64 #define GGML_MAX_OP_PARAMS 32 #define GGML_DEFAULT_N_THREADS 4 +#if UINTPTR_MAX == 0xFFFFFFFF + #define GGML_MEM_ALIGN 4 +#else + #define GGML_MEM_ALIGN 16 +#endif #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 +#define GGUF_MAGIC 0x46554747 // "GGUF" +#define GGUF_VERSION 2 + +#define GGUF_DEFAULT_ALIGNMENT 32 + #define GGML_UNUSED(x) (void)(x) #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) @@ -255,8 +276,9 @@ extern "C" { #endif -#ifdef __ARM_NEON - // we use the built-in 16-bit float type +#if defined(__ARM_NEON) && defined(__CUDACC__) + typedef half ggml_fp16_t; +#elif defined(__ARM_NEON) typedef __fp16 ggml_fp16_t; #else typedef uint16_t ggml_fp16_t; @@ -340,10 +362,12 @@ extern "C" { GGML_OP_ARGMAX, GGML_OP_REPEAT, GGML_OP_REPEAT_BACK, + GGML_OP_CONCAT, GGML_OP_SILU_BACK, GGML_OP_NORM, // normalize GGML_OP_RMS_NORM, GGML_OP_RMS_NORM_BACK, + GGML_OP_GROUP_NORM, GGML_OP_MUL_MAT, GGML_OP_OUT_PROD, @@ -369,14 +393,19 @@ extern "C" { GGML_OP_CLAMP, GGML_OP_CONV_1D, GGML_OP_CONV_2D, + GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, + GGML_OP_UPSCALE, // nearest interpolate + GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, GGML_OP_FLASH_ATTN_BACK, GGML_OP_WIN_PART, GGML_OP_WIN_UNPART, + GGML_OP_GET_REL_POS, + GGML_OP_ADD_REL_POS, GGML_OP_UNARY, @@ -458,6 +487,9 @@ extern "C" { int64_t perf_cycles; int64_t perf_time_us; + struct ggml_tensor * view_src; + size_t view_offs; + void * data; char name[GGML_MAX_NAME]; @@ -562,6 +594,7 @@ extern "C" { GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split); GGML_API int ggml_blck_size (enum ggml_type type); @@ -639,7 +672,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); - GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct 
ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); @@ -660,6 +693,7 @@ extern "C" { GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); + GGML_ATTRIBUTE_FORMAT(2, 3) GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...); // @@ -799,6 +833,13 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // concat a and b on dim 2 + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_concat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_abs( struct ggml_context * ctx, struct ggml_tensor * a); @@ -888,14 +929,15 @@ extern "C" { struct ggml_tensor * b); // normalize along rows - // TODO: eps is hardcoded to 1e-5 for now GGML_API struct ggml_tensor * ggml_norm( struct ggml_context * ctx, - struct ggml_tensor * a); + struct ggml_tensor * a, + float eps); GGML_API struct ggml_tensor * ggml_norm_inplace( struct ggml_context * ctx, - struct ggml_tensor * a); + struct ggml_tensor * a, + float eps); GGML_API struct ggml_tensor * ggml_rms_norm( struct ggml_context * ctx, @@ -907,13 +949,26 @@ extern "C" { struct ggml_tensor * a, float eps); + // group normalize along ne0*ne1*n_groups + // used in stable-diffusion + // TODO: eps is hardcoded to 1e-6 for now + GGML_API struct ggml_tensor * ggml_group_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups); + + GGML_API struct ggml_tensor * ggml_group_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups); + // a - x // b - dy - // TODO: update with configurable eps GGML_API struct ggml_tensor * ggml_rms_norm_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + float eps); // A: n columns, m rows // B: n columns, p rows (i.e. 
we transpose it internally) @@ -1207,6 +1262,15 @@ extern "C" { float freq_base, float freq_scale); + // xPos RoPE, in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + float base, + bool down); + // rotary position embedding backward, i.e compute dx from dy // a - dy GGML_API struct ggml_tensor * ggml_rope_back( @@ -1215,7 +1279,11 @@ extern "C" { int n_past, int n_dims, int mode, - int n_ctx); + int n_ctx, + float freq_base, + float freq_scale, + float xpos_base, + bool xpos_down); // alibi position embedding // in-place, returns view(a) @@ -1242,6 +1310,15 @@ extern "C" { int p0, // padding int d0); // dilation + // conv_1d with padding = half + // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) + GGML_API struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d); + GGML_API struct ggml_tensor * ggml_conv_2d( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1253,14 +1330,38 @@ extern "C" { int d0, int d1); - // conv_1d with padding = half - // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) - GGML_API struct ggml_tensor * ggml_conv_1d_ph( + + // kernel size is a->ne[0] x a->ne[1] + // stride is equal to kernel size + // padding is zero + // example: + // a: 16 16 3 768 + // b: 1024 1024 3 1 + // res: 64 64 768 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // kernel size is a->ne[0] x a->ne[1] + // stride is 1 + // padding is half + // example: + // a: 3 3 256 256 + // b: 64 64 256 1 + // res: 64 64 256 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - int s, - int d); + int stride); enum ggml_op_pool { GGML_OP_POOL_MAX, @@ -1287,6 +1388,13 @@ extern "C" { int p0, int p1); + // nearest interpolate + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_upscale( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor); + GGML_API struct ggml_tensor * ggml_flash_attn( struct ggml_context * ctx, struct ggml_tensor * q, @@ -1340,6 +1448,27 @@ extern "C" { struct ggml_tensor * a, enum ggml_unary_op op); + // used in sam + GGML_API struct ggml_tensor * ggml_get_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + int qh, + int kh); + + // used in sam + + GGML_API struct ggml_tensor * ggml_add_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + + GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + // custom operators typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); @@ -1495,7 +1624,8 @@ extern "C" { struct ggml_tensor * tensor); - GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); GGML_API struct ggml_cgraph ggml_build_forward (struct 
ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); @@ -1560,6 +1690,8 @@ extern "C" { GGML_LINESEARCH_INVALID_PARAMETERS, }; + typedef void (*ggml_opt_callback)(void * data, float * sched); + // optimization parameters // // see ggml.c (ggml_opt_default_params) for default values @@ -1595,12 +1727,14 @@ extern "C" { float sched; // schedule multiplier (fixed, decay or warmup) float decay; // weight decay for AdamW, use 0.0f to disable + int decay_min_ndim; // minimum number of tensor dimension to apply weight decay float alpha; // learning rate float beta1; float beta2; float eps; // epsilon for numerical stability float eps_f; // epsilon for convergence test float eps_g; // epsilon for convergence test + float gclip; // gradient clipping } adam; // LBFGS parameters @@ -1628,14 +1762,12 @@ extern "C" { bool just_initialized; + float loss_before; + float loss_after; + struct { - struct ggml_tensor * x; // view of the parameters - struct ggml_tensor * g1; // gradient - struct ggml_tensor * g2; // gradient squared struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment - struct ggml_tensor * mh; // first moment hat - struct ggml_tensor * vh; // second moment hat struct ggml_tensor * pf; // past function values float fx_best; float fx_prev; @@ -1672,10 +1804,10 @@ extern "C" { // initialize optimizer context GGML_API void ggml_opt_init( - struct ggml_context * ctx, + struct ggml_context * ctx, struct ggml_opt_context * opt, - struct ggml_opt_params params, - int64_t nx); + struct ggml_opt_params params, + int64_t nx); // continue optimizing the function defined by the tensor f GGML_API enum ggml_opt_result ggml_opt_resume( @@ -1689,7 +1821,9 @@ extern "C" { struct ggml_opt_context * opt, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb); + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data); // // quantization @@ -1703,6 +1837,127 @@ extern "C" { GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + // + // gguf + // + + enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, + GGUF_TYPE_UINT64 = 10, + GGUF_TYPE_INT64 = 11, + GGUF_TYPE_FLOAT64 = 12, + GGUF_TYPE_COUNT, // marks the end of the enum + }; + + struct gguf_context; + + struct gguf_init_params { + bool no_alloc; + + // if not NULL, create a ggml_context and allocate the tensor data in it + struct ggml_context ** ctx; + }; + + GGML_API struct gguf_context * gguf_init_empty(void); + GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); + //GGML_API struct gguf_context * gguf_init_from_buffer(..); + + GGML_API void gguf_free(struct gguf_context * ctx); + + GGML_API const char * gguf_type_name(enum gguf_type type); + + GGML_API int gguf_get_version (const struct gguf_context * ctx); + GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); + GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); + GGML_API void * gguf_get_data (const struct gguf_context * ctx); + + GGML_API int gguf_get_n_kv(const struct gguf_context * ctx); + GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key); + GGML_API const char * gguf_get_key 
(const struct gguf_context * ctx, int i); + + GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i); + GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i); + + // results are undefined if the wrong type is used for the key + GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i); + GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i); + GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i); + GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i); + GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i); + GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i); + GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i); + GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i); + GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i); + GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i); + GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i); + GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i); + GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i); + GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i); + GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); + + GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); + GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); + GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); + GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); + + // overrides existing values or adds a new one + GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); + GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); + GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val); + GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val); + GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); + GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); + GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); + GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); + GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); + GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); + GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); + GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); + GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n); + GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n); + + // set or add KV pairs from another context + GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src); + + // manage tensor info + GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); + GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type); + GGML_API 
void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size); + + // writing gguf files can be done in 2 ways: + // + // - write the entire gguf_context to a binary file in a single pass: + // + // gguf_write_to_file(ctx, fname, /*only_meta =*/ false); + // + // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: + // + // FILE * f = fopen(fname, "wb"); + // fseek(f, gguf_get_meta_size(ctx), SEEK_SET); + // // ... write the tensor data ... + // void * data = malloc(gguf_get_meta_size(ctx)); + // gguf_get_meta_data(ctx, data); + // fseek(f, 0, SEEK_SET); + // fwrite(data, 1, gguf_get_meta_size(ctx), f); + // free(data); + // fclose(f); + // + + // write the entire context to a binary file + GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); + + // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding + GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); + GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); + // // system info // @@ -1715,6 +1970,7 @@ extern "C" { GGML_API int ggml_cpu_has_fma (void); GGML_API int ggml_cpu_has_neon (void); GGML_API int ggml_cpu_has_arm_fma (void); + GGML_API int ggml_cpu_has_metal (void); GGML_API int ggml_cpu_has_f16c (void); GGML_API int ggml_cpu_has_fp16_va (void); GGML_API int ggml_cpu_has_wasm_simd (void); @@ -1723,6 +1979,7 @@ extern "C" { GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_sse3 (void); + GGML_API int ggml_cpu_has_ssse3 (void); GGML_API int ggml_cpu_has_vsx (void); // @@ -1740,6 +1997,10 @@ extern "C" { typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); typedef struct { + const char * type_name; + int blck_size; + size_t type_size; + bool is_quantized; ggml_to_float_t to_float; ggml_from_float_t from_float; ggml_from_float_t from_float_reference; @@ -1747,7 +2008,7 @@ extern "C" { enum ggml_type vec_dot_type; } ggml_type_traits_t; - ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i); + ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); #ifdef __cplusplus } diff --git a/gguf-py/LICENSE b/gguf-py/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..76f67efdc6470081b512a8db5bf2b1d4962d9c3c --- /dev/null +++ b/gguf-py/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Georgi Gerganov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/gguf-py/README.md b/gguf-py/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ffe25c495ab1ec04f36f7b97e9b50ad646ddc249 --- /dev/null +++ b/gguf-py/README.md @@ -0,0 +1,72 @@ +## gguf + +This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302) +(GGML Universal File) format. + +See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py) +as an example for its usage. + +## Installation +```sh +pip install gguf +``` + +## Development +Maintainers who participate in development of this package are advised to install it in editable mode: + +```sh +cd /path/to/llama.cpp/gguf-py + +pip install --editable . +``` + +**Note**: This may require upgrading your pip installation, with a message saying that editable installation currently requires `setup.py`. +In this case, upgrade pip to the latest version: + +```sh +pip install --upgrade pip +``` + +## Automatic publishing with CI + +There's a GitHub workflow that makes a release automatically when a tag in the specified format is created. + +1. Bump the version in `pyproject.toml`. +2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number. + +```sh +git tag -a gguf-v1.0.0 -m "Version 1.0 release" +``` + +3. Push the tags. + +```sh +git push origin --tags +``` + +## Manual publishing +If you want to publish the package manually for any reason, you need to have `twine` and `build` installed: + +```sh +pip install build twine +``` + +Then, follow these steps to release a new version: + +1. Bump the version in `pyproject.toml`. +2. Build the package: + +```sh +python -m build +``` + +3. Upload the generated distribution archives: + +```sh +python -m twine upload dist/* +``` + +## TODO +- [ ] Add tests +- [ ] Include conversion scripts as command line entry points in this package. +- [ ] Add CI workflow for releasing the package. 
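To get a feel for the writer API before reading the implementation, here is a minimal sketch. It assumes the `GGUFWriter` interface defined in `gguf.py` below (a constructor taking the output path and the architecture name, the `add_*` key-value helpers, `add_tensor()`, and the three `write_*_to_file()` steps); the file name, custom key, and tensor values are invented for illustration:

```python
import numpy as np

from gguf import GGUFWriter  # re-exported by gguf/__init__.py below

# "llama" selects the {arch}-prefixed key namespace, e.g. "llama.context_length"
writer = GGUFWriter("example.gguf", "llama")

writer.add_architecture()          # writes general.architecture = "llama"
writer.add_name("example-model")   # writes general.name
writer.add_uint32("answer", 42)    # an arbitrary custom key-value pair

# tensor data is queued and later written to the aligned data section
writer.add_tensor("tensor0", np.ones(32, dtype=np.float32))

# the file is written as three consecutive sections: header, KV pairs, tensor data
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```

A file produced this way can then be opened from C with `gguf_init_from_file()` declared in `ggml.h` above.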
diff --git a/gguf-py/gguf/__init__.py b/gguf-py/gguf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f9b70a85b875e3626c518c321156abc5588b8ffe --- /dev/null +++ b/gguf-py/gguf/__init__.py @@ -0,0 +1 @@ +from .gguf import * diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py new file mode 100644 index 0000000000000000000000000000000000000000..e0e0dbcbbe840d00f888002245e0e2ee9f71ca35 --- /dev/null +++ b/gguf-py/gguf/gguf.py @@ -0,0 +1,898 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import json +import os +import shutil +import struct +import sys +import tempfile +from enum import IntEnum, auto +from io import BufferedWriter +from pathlib import Path +from typing import IO, Any, BinaryIO, Callable, Sequence + +import numpy as np + +# +# constants +# + +GGUF_MAGIC = 0x46554747 +GGUF_VERSION = 2 +GGUF_DEFAULT_ALIGNMENT = 32 + +# general +KEY_GENERAL_ARCHITECTURE = "general.architecture" +KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version" +KEY_GENERAL_ALIGNMENT = "general.alignment" +KEY_GENERAL_NAME = "general.name" +KEY_GENERAL_AUTHOR = "general.author" +KEY_GENERAL_URL = "general.url" +KEY_GENERAL_DESCRIPTION = "general.description" +KEY_GENERAL_LICENSE = "general.license" +KEY_GENERAL_SOURCE_URL = "general.source.url" +KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository" +KEY_GENERAL_FILE_TYPE = "general.file_type" + +# LLM +KEY_CONTEXT_LENGTH = "{arch}.context_length" +KEY_EMBEDDING_LENGTH = "{arch}.embedding_length" +KEY_BLOCK_COUNT = "{arch}.block_count" +KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" +KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" +KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" + +# attention +KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count" +KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv" +KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" +KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv" +KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" +KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" + +# RoPE +KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" +KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base" +KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear" + +# tokenization +KEY_TOKENIZER_MODEL = "tokenizer.ggml.model" +KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens" +KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type" +KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores" +KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges" +KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id" +KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id" +KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id" +KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id" +KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id" +KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json" +KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world" + + +# +# recommended mapping of model tensor names for storage in gguf +# + + +class MODEL_ARCH(IntEnum): + LLAMA : int = auto() + FALCON : int = auto() + BAICHUAN : int = auto() + GPT2 : int = auto() + GPTJ : int = auto() + GPTNEOX : int = auto() + MPT : int = auto() + STARCODER : int = auto() + + +class MODEL_TENSOR(IntEnum): + TOKEN_EMBD : int = auto() + POS_EMBD : int = auto() + OUTPUT : int = auto() + OUTPUT_NORM : int = auto() + ROPE_FREQS : int = auto() + ATTN_Q : int = auto() + ATTN_K : int = auto() + ATTN_V : int = auto() + ATTN_QKV : int = auto() + ATTN_OUT : int = auto() + 
ATTN_NORM : int = auto() + ATTN_NORM_2 : int = auto() + ATTN_ROT_EMBD: int = auto() + FFN_GATE : int = auto() + FFN_DOWN : int = auto() + FFN_UP : int = auto() + FFN_NORM : int = auto() + + +MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.STARCODER: "starcoder", +} + +MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { + MODEL_ARCH.LLAMA: { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + }, + MODEL_ARCH.GPTNEOX: { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + }, + MODEL_ARCH.FALCON: { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + }, + MODEL_ARCH.BAICHUAN: { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + }, + MODEL_ARCH.STARCODER: { + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + }, + MODEL_ARCH.GPT2: { + # TODO + }, + # TODO +} + +# tensors that will not be serialized +MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { + MODEL_ARCH.LLAMA: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], + MODEL_ARCH.BAICHUAN: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], +} + + +class TensorNameMap: + 
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { + # Token embeddings + MODEL_TENSOR.TOKEN_EMBD: ( + "gpt_neox.embed_in", # gptneox + "transformer.wte", # gpt2 mpt + "transformer.word_embeddings", # falcon + "model.embed_tokens", # llama-hf + "tok_embeddings", # llama-pth + ), + + # Position embeddings + MODEL_TENSOR.POS_EMBD: ( + "transformer.wpe", # gpt2 + ), + + # Output + MODEL_TENSOR.OUTPUT: ( + "embed_out", # gptneox + "lm_head", # gpt2 mpt falcon llama-hf baichuan + "output", # llama-pth + ), + + # Output norm + MODEL_TENSOR.OUTPUT_NORM: ( + "gpt_neox.final_layer_norm", # gptneox + "transformer.ln_f", # gpt2 falcon + "model.norm", # llama-hf baichuan + "norm", # llama-pth + ), + + # Rope frequencies + MODEL_TENSOR.ROPE_FREQS: ( + "rope.freqs", # llama-pth + ), + } + + block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { + # Attention norm + MODEL_TENSOR.ATTN_NORM: ( + "gpt_neox.layers.{bid}.input_layernorm", # gptneox + "transformer.h.{bid}.ln_1", # gpt2 + "transformer.blocks.{bid}.norm_1", # mpt + "transformer.h.{bid}.input_layernorm", # falcon7b + "transformer.h.{bid}.ln_mlp", # falcon40b + "model.layers.{bid}.input_layernorm", # llama-hf + "layers.{bid}.attention_norm", # llama-pth + ), + + # Attention norm 2 + MODEL_TENSOR.ATTN_NORM_2: ( + "transformer.h.{bid}.ln_attn", # falcon40b + ), + + # Attention query-key-value + MODEL_TENSOR.ATTN_QKV: ( + "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox + "transformer.h.{bid}.attn.c_attn", # gpt2 + "transformer.blocks.{bid}.attn.Wqkv", # mpt + "transformer.h.{bid}.self_attention.query_key_value", # falcon + ), + + # Attention query + MODEL_TENSOR.ATTN_Q: ( + "model.layers.{bid}.self_attn.q_proj", # llama-hf + "layers.{bid}.attention.wq", # llama-pth + ), + + # Attention key + MODEL_TENSOR.ATTN_K: ( + "model.layers.{bid}.self_attn.k_proj", # llama-hf + "layers.{bid}.attention.wk", # llama-pth + ), + + # Attention value + MODEL_TENSOR.ATTN_V: ( + "model.layers.{bid}.self_attn.v_proj", # llama-hf + "layers.{bid}.attention.wv", # llama-pth + ), + + # Attention output + MODEL_TENSOR.ATTN_OUT: ( + "gpt_neox.layers.{bid}.attention.dense", # gptneox + "transformer.h.{bid}.attn.c_proj", # gpt2 + "transformer.blocks.{bid}.attn.out_proj", # mpt + "transformer.h.{bid}.self_attention.dense", # falcon + "model.layers.{bid}.self_attn.o_proj", # llama-hf + "layers.{bid}.attention.wo", # llama-pth + ), + + # Rotary embeddings + MODEL_TENSOR.ATTN_ROT_EMBD: ( + "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf + "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth + ), + + # Feed-forward norm + MODEL_TENSOR.FFN_NORM: ( + "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox + "transformer.h.{bid}.ln_2", # gpt2 + "transformer.blocks.{bid}.norm_2", # mpt + "model.layers.{bid}.post_attention_layernorm", # llama-hf + "layers.{bid}.ffn_norm", # llama-pth + ), + + # Feed-forward up + MODEL_TENSOR.FFN_UP: ( + "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox + "transformer.h.{bid}.mlp.c_fc", # gpt2 + "transformer.blocks.{bid}.ffn.up_proj", # mpt + "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon + "model.layers.{bid}.mlp.up_proj", # llama-hf + "layers.{bid}.feed_forward.w3", # llama-pth + ), + + # Feed-forward gate + MODEL_TENSOR.FFN_GATE: ( + "model.layers.{bid}.mlp.gate_proj", # llama-hf + "layers.{bid}.feed_forward.w1", # llama-pth + ), + + # Feed-forward down + MODEL_TENSOR.FFN_DOWN: ( + "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox + "transformer.h.{bid}.mlp.c_proj", # gpt2 + 
"transformer.blocks.{bid}.ffn.down_proj", # mpt + "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon + "model.layers.{bid}.mlp.down_proj", # llama-hf + "layers.{bid}.feed_forward.w2", # llama-pth + ), + } + + mapping: dict[str, tuple[MODEL_TENSOR, str]] + + tensor_names: dict[MODEL_TENSOR, str] + + def __init__(self, arch: MODEL_ARCH, n_blocks: int): + mapping = self.mapping = {} + tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch] + for tensor, keys in self.mappings_cfg.items(): + tensor_name = tensor_names.get(tensor) + if tensor_name is None: + continue + mapping[tensor_name] = (tensor, tensor_name) + for key in keys: + mapping[key] = (tensor, tensor_name) + for bid in range(n_blocks): + for tensor, keys in self.block_mappings_cfg.items(): + tensor_name = tensor_names.get(tensor) + if tensor_name is None: + continue + tensor_name = tensor_name.format(bid = bid) + mapping[tensor_name] = (tensor, tensor_name) + for key in keys: + key = key.format(bid = bid) + mapping[key] = (tensor, tensor_name) + + def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: + result = self.mapping.get(key) + if result is not None: + return result + for suffix in try_suffixes: + if key.endswith(suffix): + result = self.mapping.get(key[:-len(suffix)]) + if result is not None: + return (result[0], result[1] + suffix) + return None + + def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: + result = self.get_type_and_name(key, try_suffixes = try_suffixes) + if result is None: + return None + return result[1] + + def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: + result = self.get_type_and_name(key, try_suffixes = try_suffixes) + if result is None: + return None + return result[0] + + def __getitem__(self, key: str) -> str: + try: + return self.mapping[key][1] + except KeyError: + raise KeyError(key) + + def __contains__(self, key: str) -> bool: + return key in self.mapping + + def __repr__(self) -> str: + return repr(self.mapping) + +def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap: + return TensorNameMap(arch, n_blocks) + +class TokenType(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + +# +# implementation +# + + +class GGMLQuantizationType(IntEnum): + F32 = 0 + F16 = 1 + Q4_0 = 2 + Q4_1 = 3 + Q5_0 = 6 + Q5_1 = 7 + Q8_0 = 8 + Q8_1 = 9 + Q2_K = 10 + Q3_K = 11 + Q4_K = 12 + Q5_K = 13 + Q6_K = 14 + Q8_K = 15 + + +class GGUFValueType(IntEnum): + UINT8 = 0 + INT8 = 1 + UINT16 = 2 + INT16 = 3 + UINT32 = 4 + INT32 = 5 + FLOAT32 = 6 + BOOL = 7 + STRING = 8 + ARRAY = 9 + UINT64 = 10 + INT64 = 11 + FLOAT64 = 12 + + @staticmethod + def get_type(val): + if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray): + return GGUFValueType.STRING + elif isinstance(val, list): + return GGUFValueType.ARRAY + elif isinstance(val, float): + return GGUFValueType.FLOAT32 + elif isinstance(val, bool): + return GGUFValueType.BOOL + elif isinstance(val, int): + return GGUFValueType.INT32 + # TODO: need help with 64-bit types in Python + else: + print("Unknown type: "+str(type(val))) + sys.exit() + + +class GGUFWriter: + fout: BufferedWriter + arch: str + offset_tensor = 0 + data_alignment = GGUF_DEFAULT_ALIGNMENT + kv_data = b"" + kv_data_count = 0 + ti_data = b"" + ti_data_count = 0 + use_temp_file: bool + temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None + tensors: list[tuple[np.ndarray[Any, Any], int]] + + def 
__init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True): + self.fout = open(path, "wb") + self.arch = arch + self.add_architecture() + self.use_temp_file = use_temp_file + self.tensors = [] + + def write_header_to_file(self): + self.fout.write(struct.pack(" 0: + ltype = GGUFValueType.get_type(val[0]) + if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): + raise ValueError("All items in a GGUF array should be of the same type") + self.kv_data += struct.pack(" int: + return ((x + n - 1) // n) * n + + def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None): + assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now" + + encoded_name = name.encode("utf8") + self.ti_data += struct.pack(" bool: + tokenizer_file = path / 'tokenizer.json' + if not tokenizer_file.is_file(): + return False + with open(tokenizer_file, 'r', encoding = 'utf-8') as f: + tokenizer = json.load(f) + if self.load_merges: + merges = tokenizer.get('model', {}).get('merges') + if isinstance(merges, list) and len(merges) > 0 and isinstance(merges[0], str): + self.merges = merges + tokenizer_config_file = path / 'tokenizer_config.json' + added_tokens = tokenizer.get('added_tokens') + if added_tokens is None or not tokenizer_config_file.is_file(): + return True + with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f: + tokenizer_config = json.load(f) + for typ in self.special_token_types: + entry = tokenizer_config.get(f'{typ}_token') + if isinstance(entry, str): + tc_content = entry + elif isinstance(entry, dict): + entry_content = entry.get('content') + if not isinstance(entry_content, str): + continue + tc_content = entry_content + else: + continue + for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content): + if isinstance(maybe_token_id, int) and maybe_token_id >= 0: + self.special_token_ids[typ] = maybe_token_id + break + return True + + def try_load_from_config_json(self, path: Path) -> bool: + config_file = path / 'config.json' + if not config_file.is_file(): + return False + with open(config_file, 'r', encoding = 'utf-8') as f: + config = json.load(f) + for typ in self.special_token_types: + maybe_token_id = config.get(f'{typ}_token_id') + if isinstance(maybe_token_id, int) and maybe_token_id >= 0: + self.special_token_ids[typ] = maybe_token_id + return True + + def add_to_gguf(self, gw: GGUFWriter): + if len(self.merges) > 0: + print(f'gguf: Adding {len(self.merges)} merge(s).') + gw.add_token_merges(self.merges) + for typ, tokid in self.special_token_ids.items(): + handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None) + if handler is None: + print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping') + continue + print(f'gguf: Setting special token type {typ} to {tokid}') + handler(tokid) + + def __repr__(self): + return f'' + + +# Example usage: +if __name__ == "__main__": + # Example usage with a file + gguf_writer = GGUFWriter("example.gguf", "llama") + + gguf_writer.add_architecture() + gguf_writer.add_block_count(12) + gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer + gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float + gguf_writer.add_custom_alignment(64) + + tensor1 = np.ones((32,), dtype=np.float32) * 100.0 + tensor2 = np.ones((64,), 
dtype=np.float32) * 101.0
+    tensor3 = np.ones((96,), dtype=np.float32) * 102.0
+
+    gguf_writer.add_tensor("tensor1", tensor1)
+    gguf_writer.add_tensor("tensor2", tensor2)
+    gguf_writer.add_tensor("tensor3", tensor3)
+
+    gguf_writer.write_header_to_file()
+    gguf_writer.write_kv_data_to_file()
+    gguf_writer.write_tensors_to_file()
+
+    gguf_writer.close()
diff --git a/gguf-py/gguf/py.typed b/gguf-py/gguf/py.typed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..9489ccd6f2c011ee2ecc0d396ae53400f3ef7e74
--- /dev/null
+++ b/gguf-py/pyproject.toml
@@ -0,0 +1,29 @@
+[tool.poetry]
+name = "gguf"
+version = "0.3.3"
+description = "Write ML models in GGUF for GGML"
+authors = ["GGML <ggml@ggml.ai>"]
+packages = [
+    {include = "gguf"},
+    {include = "gguf/py.typed"},
+]
+readme = "README.md"
+homepage = "https://ggml.ai"
+repository = "https://github.com/ggerganov/llama.cpp"
+keywords = ["ggml", "gguf", "llama.cpp"]
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+
+[tool.poetry.dependencies]
+python = ">=3.8"
+numpy = ">=1.17"
+
+[tool.poetry.dev-dependencies]
+pytest = "^5.2"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/gguf-py/tests/test_gguf.py b/gguf-py/tests/test_gguf.py
new file mode 100644
index 0000000000000000000000000000000000000000..512531dd2a8f0a836fa46af6edc0c4f0e0e48667
--- /dev/null
+++ b/gguf-py/tests/test_gguf.py
@@ -0,0 +1,7 @@
+import gguf
+
+# TODO: add tests
+
+
+def test_write_gguf():
+    pass
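A minimal end-to-end sketch of driving the new package, mirroring the `__main__` example at the bottom of `gguf.py` (the file name `tiny.gguf` and the `answer` key are arbitrary choices for illustration):

```python
import struct
import numpy as np
from gguf import GGUFWriter, GGUF_MAGIC

# Write a tiny GGUF file: some metadata plus one f32 tensor.
writer = GGUFWriter("tiny.gguf", "llama")   # arch goes into general.architecture
writer.add_block_count(1)
writer.add_uint32("answer", 42)
writer.add_tensor("tensor0", np.zeros((16,), dtype=np.float32))

writer.write_header_to_file()     # magic, version, tensor/kv counts
writer.write_kv_data_to_file()    # the accumulated key-value section
writer.write_tensors_to_file()    # tensor infos plus aligned tensor data
writer.close()

# Sanity check: the file starts with the little-endian GGUF magic.
with open("tiny.gguf", "rb") as f:
    assert struct.unpack("<I", f.read(4))[0] == GGUF_MAGIC   # "GGUF"
```

The three `write_*` calls come in this order because the header records the tensor and key-value counts accumulated by the preceding `add_*` calls.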
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index a2283269c0525a682319cb5d5cdef69bdf62515f..fb0f205f85691cd47878432c11f6e28825038b4a 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -8,12 +8,15 @@
 //Python will ALWAYS provide the memory, we just write to it.
 #include
+#include <mutex>
 #include "model_adapter.h"
 #include "otherarch.h"
+#include "grammar-parser.h"

 //for easier compilation
 //concat source files into one file for compilation purposes
 #include "llama_v2.cpp"
+#include "llama_v3.cpp"
 #include "llama.cpp"
 #include "utils.cpp"
 #include "gptj_v1.cpp"
@@ -39,10 +42,14 @@ int last_token_count = 0;
 stop_reason last_stop_reason = stop_reason::INVALID;
 std::vector<int> generated_tokens;

+llama_grammar * grammar = nullptr; //currently used grammar
+grammar_parser::parse_state parsed_grammar;
+
 //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
 static FileFormat file_format = FileFormat::BADFORMAT;

 static gpt_vocab vocab;
+static int32_t n_vocab = 0;

 static gptj_v1_model gptj_ctx_v1;
 static gptj_v2_model gptj_ctx_v2;
@@ -59,10 +66,10 @@ static mpt_model mpt_ctx_v3;
 static rwkv_v2_context * rwkv_ctx_v2;
 static rwkv_context * rwkv_ctx_v3;

-static llama_v2_context_params llama_ctx_params_v2;
-static llama_context_params llama_ctx_params;
+
 static llama_v2_context * llama_ctx_v2;
-static llama_context * llama_ctx_v3;
+static llama_v3_context * llama_ctx_v3;
+static llama_context * llama_ctx_v4;

 static gpt_params params;
 static int n_past = 0;
@@ -85,7 +92,9 @@ static std::vector<int> banned_token_ids;
 static std::vector top_picks;
 static int remaining_tokens = 0;
 static int stopper_unused_tokens = 0;
+static std::mutex concat_output_mtx;
 static std::string concat_output = "";
+static std::string concat_output_reader_copy = "";

 inline bool IsNanCheck(float f)
 {
@@ -112,6 +121,133 @@ inline bool LogitsDuplicated(std::vector<float> & arr1, std::vector<float> & arr2)
 }

+static std::string FileFormatTokenizeID(int id, FileFormat file_format)
+{
+    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
+    {
+        return std::string(llama_v2_token_to_str(llama_ctx_v2, id));
+    }
+    else if (file_format == FileFormat::GGJT_3)
+    {
+        return std::string(llama_v3_token_to_str(llama_ctx_v3, id));
+    }
+    else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    {
+        return std::string(llama_token_to_str(llama_ctx_v4, id));
+    }
+    else
+    {
+        return vocab.id_to_token[id];
+    }
+}
+
+static void TokenizeString(const std::string & str_to_tokenize, std::vector<int> & output_tokens, FileFormat file_format)
+{
+    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    {
+        if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 )
+        {
+            output_tokens = ::llama_v2_tokenize(llama_ctx_v2, str_to_tokenize, true);
+        }
+        else if (file_format == FileFormat::GGML)
+        {
+            output_tokens = ::legacy_llama_v2_tokenize(llama_ctx_v2, str_to_tokenize, true);
+        }
+        else if (file_format == FileFormat::GGJT_3)
+        {
+            output_tokens = ::llama_v3_tokenize(llama_ctx_v3, str_to_tokenize, true);
+        }
+        else
+        {
+            output_tokens = ::llama_tokenize(llama_ctx_v4, str_to_tokenize, true);
+        }
+    }
+    else
+    {
+        // tokenize the prompt
+        output_tokens = ::gpt_tokenize(vocab, str_to_tokenize);
+    }
+}
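The helper that follows, `GetEosID`, centralizes which end-of-sequence token id each family of model formats uses. As a rough summary (an illustrative Python sketch, not a drop-in port; the real function dispatches on `FileFormat` values, and stock LLaMA vocabularies conventionally use id 2):

```python
def eos_id(family: str, n_vocab: int) -> int:
    """Illustrative distillation of GetEosID's per-family EOS choices."""
    if family == "llama":                 # ggml / ggjt / gguf llama variants
        return 2
    if family in ("gpt2", "gptj"):
        # starcoder-style models have n_vocab <= 50256 and reuse id 0 for EOS
        return 0 if n_vocab <= 50256 else 50256
    return 0                              # rwkv / neox / mpt families
```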
+static int GetEosID(FileFormat file_format, int32_t n_vocab)
+{
+    unsigned int eosID = 0;
+
+    if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+    {
+        if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+        {
+            eosID = llama_token_eos(llama_ctx_v4);
+        }
+        else if(file_format == FileFormat::GGJT_3)
+        {
+            eosID = llama_v3_token_eos();
+        }
+        else
+        {
+            eosID = llama_v3_token_eos();
+        }
+    }
+    else
+    {
+        if (file_format == FileFormat::GPT2_1 ||
+            file_format == FileFormat::GPT2_2 ||
+            file_format == FileFormat::GPT2_3 ||
+            file_format == FileFormat::GPT2_4 ||
+            file_format == FileFormat::GPTJ_1 ||
+            file_format == FileFormat::GPTJ_2 ||
+            file_format == FileFormat::GPTJ_3 ||
+            file_format == FileFormat::GPTJ_4 ||
+            file_format == FileFormat::GPTJ_5)
+        {
+            eosID = 50256;
+            if (n_vocab <= eosID)
+            {
+                //special case, starcoder models use ID 0 for EOS
+                eosID = 0;
+            }
+        }
+
+        if (file_format == FileFormat::RWKV_1 ||
+            file_format == FileFormat::RWKV_2 ||
+            file_format == FileFormat::NEOX_1 ||
+            file_format == FileFormat::NEOX_2 ||
+            file_format == FileFormat::NEOX_3 ||
+            file_format == FileFormat::NEOX_4 ||
+            file_format == FileFormat::NEOX_5 ||
+            file_format == FileFormat::NEOX_6 ||
+            file_format == FileFormat::NEOX_7 ||
+            file_format == FileFormat::MPT_1)
+        {
+            eosID = 0;
+        }
+    }
+    return eosID;
+}
+static float LowestLogit(const std::vector<float> & logits)
+{
+    int topid = std::min_element(logits.begin(), logits.end()) - logits.begin();
+    float v = logits[topid];
+    return (v < 0 ? (v-8) : 0);
+}
+static float LowestLogit(const float *logits, size_t size)
+{
+    if (size == 0) {
+        // Handle the case of an empty array
+        return 0.0;
+    }
+    int topid = std::min_element(logits, logits + size) - logits;
+    float v = logits[topid];
+    return (v < 0 ? (v-8) : 0);
+}
+
+static std::string RemoveBell(const std::string & input) //removes the bell character
+{
+    std::string word2;
+    std::remove_copy(input.begin(), input.end(), std::back_inserter(word2), '\a');
+    return word2;
+}
+
+
 llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng)
 {
     llama_sample_softmax(nullptr, candidates);
@@ -253,8 +389,47 @@ void sample_temperature(llama_token_data_array * candidates_p, float temp)
     }
 }

+void sample_grammar(FileFormat file_format, int32_t n_vocab, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    bool allow_eos = false;
+    for (const auto & stack : grammar->stacks) {
+        if (stack.empty()) {
+            allow_eos = true;
+            break;
+        }
+    }
+
+    const llama_token eos = GetEosID(file_format,n_vocab);
+
+    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+    std::vector<llama_grammar_candidate> candidates_grammar;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        const llama_token id = candidates->data[i].id;
+        const std::string piece = FileFormatTokenizeID(id,file_format);
+        if (id == eos) {
+            if (!allow_eos) {
+                candidates->data[i].logit = -INFINITY;
+            }
+        } else if (piece.empty() || piece[0] == 0) {
+            candidates->data[i].logit = -INFINITY;
+        } else {
+            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
+            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
+        }
+    }
+
+    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (const auto & reject : rejects) {
+        candidates->data[reject.index].logit = -INFINITY;
+    }
+
+}
+
 int SampleLogits(const float * logits, int n_ctx, int n_vocab, int rep_pen_range, float rep_pen, float top_k, float top_a,
float top_p, float typical_p, float tfs, float temp, std::mt19937 & rng, -int mirostat, float mirostat_tau, float mirostat_eta, const std::vector & sampler_order) +int mirostat, float mirostat_tau, float mirostat_eta, const std::vector & sampler_order, llama_grammar * grammar) { int id = 0; std::vector candidates; @@ -265,6 +440,10 @@ int mirostat, float mirostat_tau, float mirostat_eta, const std::vectorstacks) { + if (stack.empty()) { + return; + } + } + GGML_ASSERT(false); } - else if (file_format == FileFormat::GGJT_3) - { - return std::string(llama_token_to_str(llama_ctx_v3, id)); + const std::string piece = FileFormatTokenizeID(token,file_format); //llama_token_to_str(ctx, token); + + // Note terminating 0 in decoded string + const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8); + const auto & code_points = decoded.first; + for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { + grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); } - else + grammar->partial_utf8 = decoded.second; + GGML_ASSERT(!grammar->stacks.empty()); +} + +static void load_grammar(const std::string & gammarstr) +{ + if(grammar!=nullptr) //on demand free when next grammar is loaded { - return vocab.id_to_token[id]; + llama_grammar_free(grammar); + grammar = nullptr; + } + + if (!gammarstr.empty()) { + parsed_grammar = grammar_parser::parse(gammarstr.c_str()); + // will be empty (default) if there are parse errors + if (parsed_grammar.rules.empty()) { + printf("\nIgnored invalid grammar sampler."); + return; + } + if(debugmode==1) + { + grammar_parser::print_grammar(stderr, parsed_grammar); + } + std::vector grammar_rules(parsed_grammar.c_rules()); + grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); } } -ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format) +ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta file_format_meta) { ggml_time_init(); @@ -378,7 +588,13 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in else { //approximate NTK aware ctx - rope_freq_base = (params.n_ctx <= 3072 ? 26000.0f : (params.n_ctx <= 4096 ? 32000.0f : (params.n_ctx <= 6144 ? 54000.0f : (params.n_ctx <= 8192 ? 82684.0f : (params.n_ctx <= 12288 ? 140000.0f : 200000.0f))))); + auto effectivenctx = params.n_ctx; + if((file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) && file_format_meta.n_ctx_train > 2048) + { + float factor = file_format_meta.n_ctx_train/2048; + effectivenctx = effectivenctx/factor; + } + rope_freq_base = (effectivenctx <= 2048 ? 10000.0f : (effectivenctx <= 3072 ? 26000.0f : (effectivenctx <= 4096 ? 32000.0f : (effectivenctx <= 6144 ? 54000.0f : (effectivenctx <= 8192 ? 82684.0f : (effectivenctx <= 12288 ? 
140000.0f : 200000.0f)))))); } @@ -401,7 +617,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in //this is used for the mem_per_token eval, openblas needs more RAM bool use_scratch = ggml_cpu_has_gpublas(); - int cu_parseinfo_maindevice = inputs.cublas_info<0?0:inputs.cublas_info; + int cu_parseinfo_maindevice = inputs.cublas_info<=0?0:inputs.cublas_info; printf("System Info: %s\n", llama_print_system_info()); #if defined(GGML_USE_CUBLAS) @@ -416,8 +632,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in { //newer format has bit unshuffling SetQuantsUnshuffled(file_format == FileFormat::GGJT_2); - - llama_ctx_params_v2 = llama_v2_context_default_params(); + llama_v2_context_params llama_ctx_params_v2 = llama_v2_context_default_params(); llama_ctx_params_v2.n_ctx = inputs.max_context_length; //llama_ctx_params.n_parts = -1; llama_ctx_params_v2.seed = -1; @@ -458,6 +673,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } } + n_vocab = llama_v2_n_vocab(llama_ctx_v2); + //determine mem per token const std::vector tmp = {1, 2, 3, 4}; llama_v2_eval(llama_ctx_v2, tmp.data(), tmp.size(), 0, params.n_threads); @@ -465,7 +682,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } else if(file_format == FileFormat::GGJT_3) { - llama_ctx_params = llama_context_default_params(); + llama_v3_context_params llama_ctx_params = llama_v3_context_default_params(); llama_ctx_params.n_ctx = inputs.max_context_length; //llama_ctx_paran_parts = -1; llama_ctx_params.seed = -1; @@ -492,11 +709,10 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in if(!ts_all_zero) { llama_ctx_params.tensor_split = inputs.tensor_split; - printf("CUBLAS: Applying Custom Tensor Split!\n"); } #endif - llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params); + llama_ctx_v3 = llama_v3_init_from_file(modelname.c_str(), llama_ctx_params); if (llama_ctx_v3 == NULL) { @@ -513,7 +729,85 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in lora_base_arg = lora_base.c_str(); } - int err = llama_apply_lora_from_file(llama_ctx_v3, + int err = llama_v3_apply_lora_from_file(llama_ctx_v3, + lora_filename.c_str(), + lora_base_arg, + n_threads); + if (err != 0) + { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + return ModelLoadResult::FAIL; + } + } + + n_vocab = llama_v3_n_vocab(llama_ctx_v3); + + //determine mem per token + const std::vector tmp = {1, 2, 3, 4}; + auto er = llama_v3_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads); + if(er!=0) + { + printf("\nLLAMA EVAL returned nonzero!\n"); + } + return ModelLoadResult::SUCCESS; + } + else if(file_format==FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) + { + llama_context_params llama_ctx_params = llama_context_default_params(); + llama_ctx_params.n_ctx = inputs.max_context_length; + //llama_ctx_paran_parts = -1; + llama_ctx_params.seed = -1; + llama_ctx_params.f16_kv = inputs.f16_kv; + llama_ctx_params.low_vram = inputs.low_vram; + llama_ctx_params.mul_mat_q = inputs.use_mmq; + llama_ctx_params.logits_all = false; + llama_ctx_params.use_mmap = inputs.use_mmap; + llama_ctx_params.use_mlock = inputs.use_mlock; + llama_ctx_params.n_gpu_layers = inputs.gpulayers; + #if defined(GGML_USE_CLBLAST) + if(file_format==FileFormat::GGUF_FALCON && llama_ctx_params.n_gpu_layers>0) + { + printf("\nGPU layer offload for GGUF FALCON on OpenCL 
is known to have issues, it has been set to 0.\n"); + llama_ctx_params.n_gpu_layers = 0; + } + #endif + llama_ctx_params.main_gpu = cu_parseinfo_maindevice; + llama_ctx_params.rope_freq_base = rope_freq_base; + llama_ctx_params.rope_freq_scale = rope_freq_scale; + llama_ctx_params.n_batch = blasbatchsize; + + #if defined(GGML_USE_CUBLAS) + bool ts_all_zero = true; + for (int i = 0; i < tensor_split_max; ++i) { + if (inputs.tensor_split[i] != 0.0f) { + ts_all_zero = false; + break; + } + } + if(!ts_all_zero) + { + llama_ctx_params.tensor_split = inputs.tensor_split; + } + #endif + + llama_ctx_v4 = llama_init_from_file(modelname.c_str(), llama_ctx_params); + + if (llama_ctx_v4 == NULL) + { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str()); + return ModelLoadResult::FAIL; + } + if (lora_filename != "") + { + printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str()); + + const char * lora_base_arg = NULL; + if (lora_base != "") { + printf("Using LORA base model: %s\n", lora_base.c_str()); + lora_base_arg = lora_base.c_str(); + } + + int err = llama_apply_lora_from_file(llama_ctx_v4, lora_filename.c_str(), lora_base_arg, n_threads); @@ -524,9 +818,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } } + n_vocab = llama_n_vocab(llama_ctx_v4); + //determine mem per token const std::vector tmp = {1, 2, 3, 4}; - auto er = llama_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads); + auto er = llama_eval(llama_ctx_v4, tmp.data(), tmp.size(), 0, params.n_threads); if(er!=0) { printf("\nLLAMA EVAL returned nonzero!\n"); @@ -552,7 +848,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in const struct rwkv_file_header & header = rwkv_ctx_v3->instance->model.header; const size_t n_vocab = header.n_vocab; - printf("\nDetected Vocab: %d",n_vocab); + printf("\nDetected Vocab: %zu",n_vocab); if(n_vocab>60000) { printf("\nUsing WORLD TOKENIZER"); @@ -581,6 +877,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in printf("\nRWKV Vocab: %u\n", vocabsiz); logits.resize(vocabsiz); + n_vocab = vocab.id_to_token.size(); //handled seperately + if (file_format == FileFormat::RWKV_1) { n_batch = 1; @@ -590,7 +888,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in auto statebufsiz = rwkv_v2_get_state_buffer_element_count(rwkv_ctx_v2) * sizeof(float) + padding; auto logitbufsiz = rwkv_v2_get_logits_buffer_element_count(rwkv_ctx_v2) * sizeof(float) + padding; - printf("\nRWKV old Init: State Buffer:%u, Logit Buffer:%u\n", statebufsiz, logitbufsiz); + printf("\nRWKV old Init: State Buffer:%lu, Logit Buffer:%lu\n", statebufsiz, logitbufsiz); rwkv_ctx_v2->state_out = (float *)malloc(statebufsiz); rwkv_ctx_v2->logits_out = (float *)malloc(logitbufsiz); rwkv_ctx_v2->state_in = nullptr; @@ -618,7 +916,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in auto statebufsiz = rwkv_get_state_buffer_element_count(rwkv_ctx_v3) * sizeof(float) + padding; auto logitbufsiz = rwkv_get_logits_buffer_element_count(rwkv_ctx_v3) * sizeof(float) + padding; - printf("\nRWKV Init: State Buffer:%u, Logit Buffer:%u\n", statebufsiz, logitbufsiz); + printf("\nRWKV Init: State Buffer:%lu, Logit Buffer:%lu\n", statebufsiz, logitbufsiz); rwkv_ctx_v3->state_out = (float *)malloc(statebufsiz); rwkv_ctx_v3->logits_out = (float *)malloc(logitbufsiz); rwkv_ctx_v3->state_in = nullptr; @@ -651,6 +949,9 @@ ModelLoadResult 
gpttype_load_model(const load_model_inputs inputs, FileFormat in printf("\nTensor Transposition Detected! Retrying GPT-2 model loading..."); return res; } + + n_vocab = gpt2_ctx_v1.hparams.n_vocab; + // determine the required inference memory per token: legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format); return ModelLoadResult::SUCCESS; @@ -670,6 +971,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in printf("\nTensor Transposition Detected! Retrying GPT-2 model loading..."); return res; } + + n_vocab = gpt2_ctx_v3.hparams.n_vocab; + // determine the required inference memory per token: gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch); return ModelLoadResult::SUCCESS; @@ -690,6 +994,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in printf("\nTensor Transposition Detected! Retrying GPT-2 model loading..."); return res; } + + n_vocab = gpt2_ctx_v2.hparams.n_vocab; + // determine the required inference memory per token: gpt2_v2_eval(gpt2_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format); return ModelLoadResult::SUCCESS; @@ -708,6 +1015,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in printf("\nTensor Transposition Detected! Retrying GPT-J model loading..."); return res; } + + n_vocab = gptj_ctx_v1.hparams.n_vocab; + // determine the required inference memory per token: legacy_gptj_eval(gptj_ctx_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format); @@ -737,6 +1047,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in return loadresult; } + n_vocab = gptj_ctx_v3.hparams.n_vocab; + // determine the required inference memory per token: gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch); @@ -773,6 +1085,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in return loadresult; } + n_vocab = gptj_ctx_v2.hparams.n_vocab; + // determine the required inference memory per token: gptj_v2_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); @@ -809,6 +1123,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in return res; } + n_vocab = neox_ctx_v3.hparams.n_vocab; + // determine the required inference memory per token: gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch); @@ -831,6 +1147,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in return res; } + n_vocab = neox_ctx_v2.hparams.n_vocab; + // determine the required inference memory per token: gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); @@ -866,6 +1184,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in return ModelLoadResult::FAIL; } + n_vocab = mpt_ctx_v3.hparams.n_vocab; + // determine the required inference memory per token: mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, use_scratch); return ModelLoadResult::SUCCESS; @@ -885,14 +1205,36 @@ bool gpttype_generate_abort() return true; } +int gpttype_token_count(const std::string & input) +{ + if(debugmode==1) + { + printf("\nFileFormat: %d, Tokenizing: %s",file_format ,input.c_str()); + } + std::vector toks; + TokenizeString(input, toks, file_format); + int tokcount = toks.size(); + 
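        /* (token counting reuses the same TokenizeString dispatch as generation,
           so the reported count matches what the model will actually consume) */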
if(debugmode==1) + { + printf("\nTokens Counted: %d\n",tokcount); + } + return tokcount; +} + const std::string & gpttype_get_pending_output() { - return concat_output; + concat_output_mtx.lock(); + concat_output_reader_copy = concat_output; + concat_output_mtx.unlock(); + return concat_output_reader_copy; } generation_outputs gpttype_generate(const generation_inputs inputs, generation_outputs &output) { + concat_output_mtx.lock(); concat_output = ""; + concat_output_reader_copy = ""; + concat_output_mtx.unlock(); last_stop_reason = stop_reason::OUT_OF_TOKENS; stop_sequence.clear(); for(int x=0;x embd_inp; - - if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3) - { - params.prompt.insert(0, 1, ' '); - if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 ) - { - embd_inp = ::llama_v2_tokenize(llama_ctx_v2, params.prompt, true); - } - else if (file_format == FileFormat::GGML) - { - embd_inp = ::legacy_llama_v2_tokenize(llama_ctx_v2, params.prompt, true); - } - else - { - embd_inp = ::llama_tokenize(llama_ctx_v3, params.prompt, true); - } - } - else - { - // tokenize the prompt - embd_inp = ::gpt_tokenize(vocab, params.prompt); - } + TokenizeString(params.prompt, embd_inp, file_format); //truncate to front of the prompt if its too long int32_t nctx = params.n_ctx; @@ -1004,7 +1328,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o { //for non llama, limit to 256 int bbs = blasbatchsize; - if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3) + if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON) { bbs = (blasbatchsize > 256 ? 
256 : blasbatchsize); } @@ -1054,55 +1378,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o timer_start(); double time1 = 0, time2 = 0; - int32_t n_vocab = 0; - if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2) - { - n_vocab = llama_v2_n_vocab(llama_ctx_v2); - } - else if(file_format == FileFormat::GGJT_3) - { - n_vocab = llama_n_vocab(llama_ctx_v3); - } - else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2) - { - n_vocab = gptj_ctx_v1.hparams.n_vocab; - } - else if(file_format == FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4) - { - n_vocab = gptj_ctx_v2.hparams.n_vocab; - } - else if(file_format==FileFormat::GPTJ_5) - { - n_vocab = gptj_ctx_v3.hparams.n_vocab; - } - else if(file_format == FileFormat::GPT2_1) - { - n_vocab = gpt2_ctx_v1.hparams.n_vocab; - } - else if(file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3) + if(file_format == FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2) { - n_vocab = gpt2_ctx_v2.hparams.n_vocab; - } - else if(file_format==FileFormat::GPT2_4) - { - n_vocab = gpt2_ctx_v3.hparams.n_vocab; - } - else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5) - { - n_vocab = neox_ctx_v2.hparams.n_vocab; - } - else if( file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7) - { - n_vocab = neox_ctx_v3.hparams.n_vocab; - } - else if( file_format==FileFormat::MPT_1) - { - n_vocab = mpt_ctx_v3.hparams.n_vocab; - } - else if(file_format == FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2) - { - n_vocab = vocab.id_to_token.size(); //handled seperately if(n_past==0) { if(file_format == FileFormat::RWKV_1) @@ -1133,15 +1411,16 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } } } - else + + if(n_vocab<=0) { - printf("Bad format!"); + printf("\nWarning! n_vocab is invalid, maybe bad format!"); } //prepare banned tokens if(banned_token_ids.size()==0 && banned_tokens.size()>0) { - printf("\n[First Run] Banning %d token sequences...",banned_tokens.size()); + printf("\n[First Run] Banning %zu token sequences...",banned_tokens.size()); for(int v=0;v 0) @@ -1194,7 +1473,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o //print progress if (!startedsampling && debugmode!=-1) { - printf("\rProcessing Prompt%s (%d / %d tokens)", (blasmode ? " [BLAS]" : ""), input_consumed, embd_inp.size()); + printf("\rProcessing Prompt%s (%d / %zu tokens)", (blasmode ? 
" [BLAS]" : ""), input_consumed, embd_inp.size()); } fflush(stdout); @@ -1209,7 +1488,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format == FileFormat::GGJT_3) { - evalres = (llama_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0); + evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0); + } + else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) + { + evalres = (llama_eval(llama_ctx_v4, embd.data(), embdsize, n_past, params.n_threads)==0); } else if(file_format==FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2) { @@ -1312,102 +1595,52 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } } - unsigned int eosID = 0; + unsigned int eosID = GetEosID(file_format, n_vocab); float * logitsPtr; + float lowestLogit = 0; int btsize = banned_token_ids.size(); - if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3) + if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) { - if(file_format == FileFormat::GGJT_3) + if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) { - logitsPtr = llama_get_logits(llama_ctx_v3); + logitsPtr = llama_get_logits(llama_ctx_v4); } - else + else if(file_format == FileFormat::GGJT_3) { - logitsPtr = llama_v2_get_logits(llama_ctx_v2); + logitsPtr = llama_v3_get_logits(llama_ctx_v3); } - - eosID = llama_token_eos(); - - if (!unbanTokens) - { - // set the logit of the eos token (2) to zero to avoid sampling it - logitsPtr[eosID] = 0; - } - - if(btsize>0) + else { - for(int t=0;t eosID) - { - int topid = std::min_element(logits.begin(),logits.end())-logits.begin(); - logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0); - } - else - { - //special case, starcoder models use ID 0 for EOS - if (file_format == FileFormat::GPT2_3 || file_format == FileFormat::GPT2_4) - { - eosID = 0; - int topid = std::min_element(logits.begin(), logits.end()) - logits.begin(); - logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0); - } - } - } - - // set the logit of the eos token (0) to minimum to avoid sampling it - if (file_format == FileFormat::RWKV_1 || - file_format == FileFormat::RWKV_2 || - file_format == FileFormat::NEOX_1 || - file_format == FileFormat::NEOX_2 || - file_format == FileFormat::NEOX_3 || - file_format == FileFormat::NEOX_4 || - file_format == FileFormat::NEOX_5 || - file_format == FileFormat::NEOX_6 || - file_format == FileFormat::NEOX_7 || - file_format == FileFormat::MPT_1) - { - eosID = 0; - int topid = std::min_element(logits.begin(),logits.end())-logits.begin(); - logits[eosID] = (logits[topid] < 0 ? 
logits[topid] : 0); - } - } + lowestLogit = LowestLogit(logits); + } - if(btsize>0) + if (!unbanTokens && !inputs.unban_tokens_rt) + { + // set the logit of the eos token to very low to avoid sampling it + logitsPtr[eosID] = lowestLogit; + } + if(btsize>0) + { + for(int t=0;t --grammar-file grammars/some-grammar.gbnf -p 'Some prompt' +``` diff --git a/grammars/c.gbnf b/grammars/c.gbnf new file mode 100644 index 0000000000000000000000000000000000000000..4a0331dd2d6df91b6e2154b78dc31df57a4e715e --- /dev/null +++ b/grammars/c.gbnf @@ -0,0 +1,42 @@ +root ::= (declaration)* + +declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}" + +dataType ::= "int" ws | "float" ws | "char" ws +identifier ::= [a-zA-Z_] [a-zA-Z_0-9]* + +parameter ::= dataType identifier + +statement ::= + ( dataType identifier ws "=" ws expression ";" ) | + ( identifier ws "=" ws expression ";" ) | + ( identifier ws "(" argList? ")" ";" ) | + ( "return" ws expression ";" ) | + ( "while" "(" condition ")" "{" statement* "}" ) | + ( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) | + ( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) | + ( singleLineComment ) | + ( multiLineComment ) + +forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression +forUpdate ::= identifier ws "=" ws expression + +condition ::= expression relationOperator expression +relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">") + +expression ::= term (("+" | "-") term)* +term ::= factor(("*" | "/") factor)* + +factor ::= identifier | number | unaryTerm | funcCall | parenExpression +unaryTerm ::= "-" factor +funcCall ::= identifier "(" argList? ")" +parenExpression ::= "(" ws expression ws ")" + +argList ::= expression ("," ws expression)* + +number ::= [0-9]+ + +singleLineComment ::= "//" [^\n]* "\n" +multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/" + +ws ::= ([ \t\n]+) diff --git a/grammars/json_arr.gbnf b/grammars/json_arr.gbnf new file mode 100644 index 0000000000000000000000000000000000000000..ef53e77a0baddc5c1dbdcaf505c3597338a47677 --- /dev/null +++ b/grammars/json_arr.gbnf @@ -0,0 +1,34 @@ +# This is the same as json.gbnf but we restrict whitespaces at the end of the root array +# Useful for generating JSON arrays + +root ::= arr +value ::= object | array | string | number | ("true" | "false" | "null") ws + +arr ::= + "[\n" ws ( + value + (",\n" ws value)* + )? "]" + +object ::= + "{" ws ( + string ":" ws value + ("," ws string ":" ws value)* + )? "}" ws + +array ::= + "[" ws ( + value + ("," ws value)* + )? "]" ws + +string ::= + "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes + )* "\"" ws + +number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws + +# Optional space: by convention, applied in this grammar after literal chars when allowed +ws ::= ([ \t\n] ws)? 
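The `sample_grammar` hook added to `gpttype_adapter.cpp` above and these `.gbnf` files implement the same idea: before sampling, every candidate token whose decoded text cannot extend a string of the grammar has its logit forced to minus infinity, so only grammar-conforming continuations survive. A minimal sketch of that masking step (the `is_valid_continuation` oracle here stands in for the real grammar-stack check):

```python
import math

def mask_logits_with_grammar(logits, vocab, is_valid_continuation, prefix):
    """Force disallowed continuations to -inf; sampling then cannot pick them.

    logits: list[float], one score per token id
    vocab: list[str], decoded text of each token id
    is_valid_continuation: callable(str) -> bool, stand-in for the grammar check
    prefix: text generated so far
    """
    masked = list(logits)
    for tok_id, piece in enumerate(vocab):
        if not is_valid_continuation(prefix + piece):
            masked[tok_id] = -math.inf
    return masked

# Toy grammar: output must be a run of digits.
vocab = ["1", "2", "a", "12"]
logits = [0.1, 0.5, 2.0, 0.3]
masked = mask_logits_with_grammar(logits, vocab, lambda s: s.isdigit(), "4")
assert masked[2] == -math.inf   # "4a" is rejected despite the highest raw logit
```

The real implementation additionally threads partial UTF-8 state through the grammar stacks (`decode_utf8` / `llama_grammar_reject_candidates`), which handles tokens that end mid-codepoint, and the shipped grammars can be exercised from the CLI as the README fragment above shows (`--grammar-file grammars/json_arr.gbnf`).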
diff --git a/k_quants.c b/k_quants.c index 6348fce6b94d031071302b40c10c5113850f473c..62085882df71c4b2e78c687c839e877a850669b5 100644 --- a/k_quants.c +++ b/k_quants.c @@ -13,6 +13,26 @@ // #include +#if !defined(__aarch64__) +inline static int32_t vaddvq_s16(int16x8_t v) { + return + (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + + (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + + (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + + (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); +} + +inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { + int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); + int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); + return vcombine_s16(a0, b0); +} + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} +#endif + #else #ifdef __wasm_simd128__ @@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * float ax = fabsf(x[i]); if (ax > amax) { amax = ax; max = x[i]; } } - if (!amax) { // all zero + if (amax < 1e-30f) { // all zero for (int i = 0; i < n; ++i) { L[i] = 0; } @@ -77,6 +97,11 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * } return 1/iscale; } + bool return_early = false; + if (rmse_type < 0) { + rmse_type = -rmse_type; + return_early = true; + } int weight_type = rmse_type%2; float sumlx = 0; float suml2 = 0; @@ -89,56 +114,9 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * suml2 += w*l*l; } float scale = sumlx/suml2; + if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale; float best = scale * sumlx; - for (int itry = 0; itry < 3; ++itry) { - iscale = 1/scale; - float slx = 0; - float sl2 = 0; - bool changed = false; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - if (l + nmax != L[i]) { changed = true; } - float w = weight_type == 1 ? x[i] * x[i] : 1.f; - slx += w*x[i]*l; - sl2 += w*l*l; - } - if (!changed || sl2 == 0 || slx*slx <= best*sl2) { break; } - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); - } - sumlx = slx; suml2 = sl2; - scale = sumlx/suml2; - best = scale * sumlx; - } - for (int itry = 0; itry < 5; ++itry) { - int n_changed = 0; - for (int i = 0; i < n; ++i) { - float w = weight_type == 1 ? 
x[i]*x[i] : 1; - int l = L[i] - nmax; - float slx = sumlx - w*x[i]*l; - if (slx > 0) { - float sl2 = suml2 - w*l*l; - int new_l = nearest_int(x[i] * sl2 / slx); - new_l = MAX(-nmax, MIN(nmax-1, new_l)); - if (new_l != l) { - slx += w*x[i]*new_l; - sl2 += w*new_l*new_l; - if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) { - L[i] = nmax + new_l; sumlx = slx; suml2 = sl2; - scale = sumlx / suml2; best = scale * sumlx; - ++n_changed; - } - } - } - } - if (!n_changed) { break; } - } - if (rmse_type < 3) { - return scale; - } - for (int is = -4; is <= 4; ++is) { + for (int is = -9; is <= 9; ++is) { if (is == 0) { continue; } @@ -221,7 +199,8 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, int ntry) { +static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, + int ntry, float alpha) { float min = x[0]; float max = x[0]; for (int i = 1; i < n; ++i) { @@ -254,7 +233,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t for (int i = 0; i < n; ++i) { sum += x[i] - scale*L[i]; } - min = sum/n; + min = alpha*min + (1 - alpha)*sum/n; if (min > 0) min = 0; iscale = 1/scale; if (!did_change) break; @@ -263,6 +242,82 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } +static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, + uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, + float rmin, float rdelta, int nstep, bool use_mad) { + float min = x[0]; + float max = x[0]; + float sum_w = weights[0]; + float sum_x = sum_w * x[0]; + for (int i = 1; i < n; ++i) { + if (x[i] < min) min = x[i]; + if (x[i] > max) max = x[i]; + float w = weights[i]; + sum_w += w; + sum_x += w * x[i]; + } + if (min > 0) min = 0; + if (max == min) { + for (int i = 0; i < n; ++i) L[i] = 0; + *the_min = -min; + return 0.f; + } + float iscale = nmax/(max - min); + float scale = 1/iscale; + float best_mad = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + L[i] = MAX(0, MIN(nmax, l)); + float diff = scale * L[i] + min - x[i]; + diff = use_mad ? fabsf(diff) : diff * diff; + float w = weights[i]; + best_mad += w * diff; + } + if (nstep < 1) { + *the_min = -min; + return scale; + } + for (int is = 0; is <= nstep; ++is) { + iscale = (rmin + rdelta*is + nmax)/(max - min); + float sum_l = 0, sum_l2 = 0, sum_xl = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + l = MAX(0, MIN(nmax, l)); + Laux[i] = l; + float w = weights[i]; + sum_l += w*l; + sum_l2 += w*l*l; + sum_xl += w*l*x[i]; + } + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D > 0) { + float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D; + float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D; + if (this_min > 0) { + this_min = 0; + this_scale = sum_xl / sum_l2; + } + float mad = 0; + for (int i = 0; i < n; ++i) { + float diff = this_scale * Laux[i] + this_min - x[i]; + diff = use_mad ? 
fabsf(diff) : diff * diff; + float w = weights[i]; + mad += w * diff; + } + if (mad < best_mad) { + for (int i = 0; i < n; ++i) { + L[i] = Laux[i]; + } + best_mad = mad; + scale = this_scale; + min = this_min; + } + } + } + *the_min = -min; + return scale; +} + #if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { if (j < 4) { @@ -281,6 +336,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict const int nb = k / QK_K; uint8_t L[QK_K]; + uint8_t Laux[16]; + float weights[16]; float mins[QK_K/16]; float scales[QK_K/16]; @@ -291,7 +348,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5); + for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]); + scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -637,6 +695,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict const int nb = k / QK_K; uint8_t L[QK_K]; + uint8_t Laux[32]; + float weights[32]; float mins[QK_K/32]; float scales[QK_K/32]; @@ -645,7 +705,12 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 5); + //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -798,6 +863,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict uint8_t L[QK_K]; float mins[QK_K/32]; float scales[QK_K/32]; + float weights[32]; + uint8_t Laux[32]; #else int8_t L[QK_K]; float scales[QK_K/16]; @@ -810,7 +877,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5); + //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -1014,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } + if (!max_abs_scale) { + memset(&y[i], 0, sizeof(block_q6_K)); + y[i].d = ggml_fp32_to_fp16(0.f); + x += QK_K; + continue; + } + float iscale = -128.f/max_scale; y[i].d = ggml_fp32_to_fp16(1/iscale); for (int ib = 0; ib < QK_K/16; ++ib) { @@ -1250,7 +1329,9 @@ void 
ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const uint8x16_t m3 = vdupq_n_u8(0x3); const uint8x16_t m4 = vdupq_n_u8(0xF); +#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t vzero = vdupq_n_s32(0); +#endif int8x16x2_t q2bytes; uint8_t aux[16]; @@ -1556,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #ifdef __ARM_NEON const uint8x16_t m3 = vdupq_n_u8(0x3); +#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t vzero = vdupq_n_s32(0); +#endif int8x16x4_t q2bytes; @@ -2004,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); - uint32_t *aux; + const uint32_t *aux; for (int i = 0; i < nb; ++i) { @@ -2014,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const int8_t * restrict q8 = y[i].qs; // Set up scales - aux = (uint32_t *)x[i].scales; + aux = (const uint32_t *)x[i].scales; __m128i scales128 = _mm_set_epi32( ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), @@ -2526,7 +2609,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri memcpy(utmp, x[i].scales, 12); - const uint32x2_t mins8 = {utmp[1] & kmask1, ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4)}; + uint32x2_t mins8 = { 0 }; + mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0); + mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1); + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); utmp[0] &= kmask1; @@ -2540,8 +2626,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * restrict q4 = x[i].qs; const int8_t * restrict q8 = y[i].qs; - //int32x4_t isum = mzero; - int32_t sumi1 = 0; int32_t sumi2 = 0; @@ -2638,13 +2722,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; __m256i p16l = _mm256_maddubs_epi16(q4l, q8l); p16l = _mm256_madd_epi16(scale_l, p16l); - sumi = _mm256_add_epi32(sumi, p16l); const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; __m256i p16h = _mm256_maddubs_epi16(q4h, q8h); p16h = _mm256_madd_epi16(scale_h, p16h); - sumi = _mm256_add_epi32(sumi, p16h); + const __m256i sumj = _mm256_add_epi32(p16l, p16h); + sumi = _mm256_add_epi32(sumi, sumj); } __m256 vd = _mm256_set1_ps(d); @@ -3040,9 +3124,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #ifdef __ARM_NEON const uint8x16_t m4b = vdupq_n_u8(0xf); - const int32x4_t mzero = vdupq_n_s32(0); const uint8x16_t mone = vdupq_n_u8(1); const uint8x16_t mtwo = vdupq_n_u8(2); +#if defined(__ARM_FEATURE_DOTPROD) + const int32x4_t mzero = vdupq_n_s32(0); +#endif int8x16x4_t q5bytes; @@ -3385,8 +3471,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #ifdef __ARM_NEON const uint8x16_t m4b = vdupq_n_u8(0xf); - const int32x4_t mzero = vdupq_n_s32(0); const uint8x16_t mh = vdupq_n_u8(16); +#if defined(__ARM_FEATURE_DOTPROD) + const int32x4_t mzero = vdupq_n_s32(0); +#endif int8x16x4_t q5bytes; uint8x16x4_t q5h; @@ -3604,7 +3692,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sum = 0; const uint8x16_t m4b = vdupq_n_u8(0xF); +#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t vzero = vdupq_n_s32(0); +#endif //const int8x16_t m32s = vdupq_n_s8(32); 
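    /* Note: in each of these hunks vzero feeds only the vdotq_s32 path, so its
       declaration moves inside the __ARM_FEATURE_DOTPROD guard; NEON targets
       without dot-product support otherwise emit unused-variable warnings. */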
@@ -3993,8 +4083,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t vzero = vdupq_n_s32(0);
     const int8x16_t  m32s = vdupq_n_s8(32);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     const uint8x16_t mone = vdupq_n_u8(3);
 
diff --git a/klite.embd b/klite.embd
index 153b4af7de47b5d7e0d62ba7308d043b84b5393b..9a4a31f6aa79215343e76a58b6cde84be7765eea 100644
--- a/klite.embd
+++ b/klite.embd
@@ -1,780 +1,10481 @@
[klite.embd hunk omitted: this change replaces the embedded KoboldAI Lite single-file web UI wholesale, growing it from 780 to 10481 lines; the payload in the original diff is minified vendor assets (normalize.css v3.0.3, Bootstrap 3 CSS with its Glyphicons rules) plus the UI's HTML markup, and contains no hand-written logic that is reviewable line by line.]
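
[Editor's note] Returning to the k_quants.c changes: the early-out added to quantize_row_q6_K_reference (the @@ -1014,6 +1086,13 @@ hunk) matters because the very next statement divides by max_scale. For a block whose sub-block scales are all zero that is an IEEE division by zero, and the resulting -inf/NaN would propagate into every packed value. A minimal demonstration of the failure mode the memset guard avoids (stand-in size and names, not the block_q6_K layout):

#include <stdio.h>
#include <string.h>

int main(void) {
    float max_scale = 0.0f;              // block where every sub-block scale is zero
    float iscale = -128.f/max_scale;     // IEEE division by zero: -inf
    printf("iscale = %f, iscale*scale = %f\n", iscale, iscale*0.0f);  // prints -inf, then nan
    // the guarded path from the hunk: emit an explicit all-zero block instead
    unsigned char block[210];            // stand-in size, not sizeof(block_q6_K)
    if (max_scale == 0.0f) {
        memset(block, 0, sizeof(block));
    }
    return 0;
}

Zeroing the whole block (and storing d = 0) makes dequantization of such blocks exact instead of undefined.
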
15px}.nav>li>a:focus,.nav>li>a:hover{text-decoration:none;background-color:#eee}.nav>li.disabled>a{color:#777}.nav>li.disabled>a:focus,.nav>li.disabled>a:hover{color:#777;text-decoration:none;cursor:not-allowed;background-color:transparent}.nav .open>a,.nav .open>a:focus,.nav .open>a:hover{background-color:#eee;border-color:#337ab7}.nav .nav-divider{height:1px;margin:9px 0;overflow:hidden;background-color:#e5e5e5}.nav>li>a>img{max-width:none}.nav-tabs{border-bottom:1px solid #ddd}.nav-tabs>li{float:left;margin-bottom:-1px}.nav-tabs>li>a{margin-right:2px;line-height:1.42857143;border:1px solid transparent;border-radius:4px 4px 0 0}.nav-tabs>li>a:hover{border-color:#eee #eee #ddd}.nav-tabs>li.active>a,.nav-tabs>li.active>a:focus,.nav-tabs>li.active>a:hover{color:#555;cursor:default;background-color:#fff;border:1px solid #ddd;border-bottom-color:transparent}.nav-tabs.nav-justified{width:100%;border-bottom:0}.nav-tabs.nav-justified>li{float:none}.nav-tabs.nav-justified>li>a{margin-bottom:5px;text-align:center}.nav-tabs.nav-justified>.dropdown .dropdown-menu{top:auto;left:auto}@media (min-width:768px){.nav-tabs.nav-justified>li{display:table-cell;width:1%}.nav-tabs.nav-justified>li>a{margin-bottom:0}}.nav-tabs.nav-justified>li>a{margin-right:0;border-radius:4px}.nav-tabs.nav-justified>.active>a,.nav-tabs.nav-justified>.active>a:focus,.nav-tabs.nav-justified>.active>a:hover{border:1px solid #ddd}@media (min-width:768px){.nav-tabs.nav-justified>li>a{border-bottom:1px solid #ddd;border-radius:4px 4px 0 0}.nav-tabs.nav-justified>.active>a,.nav-tabs.nav-justified>.active>a:focus,.nav-tabs.nav-justified>.active>a:hover{border-bottom-color:#fff}}.nav-pills>li{float:left}.nav-pills>li>a{border-radius:4px}.nav-pills>li+li{margin-left:2px}.nav-pills>li.active>a,.nav-pills>li.active>a:focus,.nav-pills>li.active>a:hover{color:#fff;background-color:#337ab7}.nav-stacked>li{float:none}.nav-stacked>li+li{margin-top:2px;margin-left:0}.nav-justified{width:100%}.nav-justified>li{float:none}.nav-justified>li>a{margin-bottom:5px;text-align:center}.nav-justified>.dropdown .dropdown-menu{top:auto;left:auto}@media (min-width:768px){.nav-justified>li{display:table-cell;width:1%}.nav-justified>li>a{margin-bottom:0}}.nav-tabs-justified{border-bottom:0}.nav-tabs-justified>li>a{margin-right:0;border-radius:4px}.nav-tabs-justified>.active>a,.nav-tabs-justified>.active>a:focus,.nav-tabs-justified>.active>a:hover{border:1px solid #ddd}@media (min-width:768px){.nav-tabs-justified>li>a{border-bottom:1px solid #ddd;border-radius:4px 4px 0 0}.nav-tabs-justified>.active>a,.nav-tabs-justified>.active>a:focus,.nav-tabs-justified>.active>a:hover{border-bottom-color:#fff}}.tab-content>.tab-pane{display:none}.tab-content>.active{display:block}.nav-tabs .dropdown-menu{margin-top:-1px;border-top-left-radius:0;border-top-right-radius:0}.navbar{position:relative;min-height:50px;margin-bottom:20px;border:1px solid transparent}@media (min-width:768px){.navbar{border-radius:4px}}@media (min-width:768px){.navbar-header{float:left}}.navbar-collapse{padding-right:15px;padding-left:15px;overflow-x:visible;border-top:1px solid transparent;-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,.1);box-shadow:inset 0 1px 0 rgba(255,255,255,.1);-webkit-overflow-scrolling:touch}.navbar-collapse.in{overflow-y:auto}@media 
(min-width:768px){.navbar-collapse{width:auto;border-top:0;-webkit-box-shadow:none;box-shadow:none}.navbar-collapse.collapse{display:block!important;height:auto!important;padding-bottom:0;overflow:visible!important}.navbar-collapse.in{overflow-y:visible}.navbar-fixed-bottom .navbar-collapse,.navbar-fixed-top .navbar-collapse,.navbar-static-top .navbar-collapse{padding-right:0;padding-left:0}}.navbar-fixed-bottom,.navbar-fixed-top{position:fixed;right:0;left:0;z-index:1030}.navbar-fixed-bottom .navbar-collapse,.navbar-fixed-top .navbar-collapse{max-height:340px}@media (max-device-width:480px) and (orientation:landscape){.navbar-fixed-bottom .navbar-collapse,.navbar-fixed-top .navbar-collapse{max-height:200px}}@media (min-width:768px){.navbar-fixed-bottom,.navbar-fixed-top{border-radius:0}}.navbar-fixed-top{top:0;border-width:0 0 1px}.navbar-fixed-bottom{bottom:0;margin-bottom:0;border-width:1px 0 0}.container-fluid>.navbar-collapse,.container-fluid>.navbar-header,.container>.navbar-collapse,.container>.navbar-header{margin-right:-15px;margin-left:-15px}@media (min-width:768px){.container-fluid>.navbar-collapse,.container-fluid>.navbar-header,.container>.navbar-collapse,.container>.navbar-header{margin-right:0;margin-left:0}}.navbar-static-top{z-index:1000;border-width:0 0 1px}@media (min-width:768px){.navbar-static-top{border-radius:0}}.navbar-brand{float:left;height:50px;padding:15px 15px;font-size:18px;line-height:20px}.navbar-brand:focus,.navbar-brand:hover{text-decoration:none}.navbar-brand>img{display:block}@media (min-width:768px){.navbar>.container .navbar-brand,.navbar>.container-fluid .navbar-brand{margin-left:-15px}}.navbar-toggle{position:relative;float:right;padding:9px 10px;margin-right:15px;margin-top:8px;margin-bottom:8px;background-color:transparent;background-image:none;border:1px solid transparent;border-radius:4px}.navbar-toggle:focus{outline:0}.navbar-toggle .icon-bar{display:block;width:22px;height:2px;border-radius:1px}.navbar-toggle .icon-bar+.icon-bar{margin-top:4px}@media (min-width:768px){.navbar-toggle{display:none}}.navbar-nav{margin:7.5px -15px}.navbar-nav>li>a{padding-top:10px;padding-bottom:10px;line-height:20px}@media (max-width:767px){.navbar-nav .open .dropdown-menu{position:static;float:none;width:auto;margin-top:0;background-color:transparent;border:0;-webkit-box-shadow:none;box-shadow:none}.navbar-nav .open .dropdown-menu .dropdown-header,.navbar-nav .open .dropdown-menu>li>a{padding:5px 15px 5px 25px}.navbar-nav .open .dropdown-menu>li>a{line-height:20px}.navbar-nav .open .dropdown-menu>li>a:focus,.navbar-nav .open .dropdown-menu>li>a:hover{background-image:none}}@media (min-width:768px){.navbar-nav{float:left;margin:0}.navbar-nav>li{float:left}.navbar-nav>li>a{padding-top:15px;padding-bottom:15px}}.navbar-form{padding:10px 15px;margin-right:-15px;margin-left:-15px;border-top:1px solid transparent;border-bottom:1px solid transparent;-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,.1),0 1px 0 rgba(255,255,255,.1);box-shadow:inset 0 1px 0 rgba(255,255,255,.1),0 1px 0 rgba(255,255,255,.1);margin-top:8px;margin-bottom:8px}@media (min-width:768px){.navbar-form .form-group{display:inline-block;margin-bottom:0;vertical-align:middle}.navbar-form .form-control{display:inline-block;width:auto;vertical-align:middle}.navbar-form .form-control-static{display:inline-block}.navbar-form .input-group{display:inline-table;vertical-align:middle}.navbar-form .input-group .form-control,.navbar-form .input-group .input-group-addon,.navbar-form .input-group 
.input-group-btn{width:auto}.navbar-form .input-group>.form-control{width:100%}.navbar-form .control-label{margin-bottom:0;vertical-align:middle}.navbar-form .checkbox,.navbar-form .radio{display:inline-block;margin-top:0;margin-bottom:0;vertical-align:middle}.navbar-form .checkbox label,.navbar-form .radio label{padding-left:0}.navbar-form .checkbox input[type=checkbox],.navbar-form .radio input[type=radio]{position:relative;margin-left:0}.navbar-form .has-feedback .form-control-feedback{top:0}}@media (max-width:767px){.navbar-form .form-group{margin-bottom:5px}.navbar-form .form-group:last-child{margin-bottom:0}}@media (min-width:768px){.navbar-form{width:auto;padding-top:0;padding-bottom:0;margin-right:0;margin-left:0;border:0;-webkit-box-shadow:none;box-shadow:none}}.navbar-nav>li>.dropdown-menu{margin-top:0;border-top-left-radius:0;border-top-right-radius:0}.navbar-fixed-bottom .navbar-nav>li>.dropdown-menu{margin-bottom:0;border-top-left-radius:4px;border-top-right-radius:4px;border-bottom-right-radius:0;border-bottom-left-radius:0}.navbar-btn{margin-top:8px;margin-bottom:8px}.navbar-btn.btn-sm{margin-top:10px;margin-bottom:10px}.navbar-btn.btn-xs{margin-top:14px;margin-bottom:14px}.navbar-text{margin-top:15px;margin-bottom:15px}@media (min-width:768px){.navbar-text{float:left;margin-right:15px;margin-left:15px}}@media (min-width:768px){.navbar-left{float:left!important}.navbar-right{float:right!important;margin-right:-15px}.navbar-right~.navbar-right{margin-right:0}}.navbar-default{background-color:#f8f8f8;border-color:#e7e7e7}.navbar-default .navbar-brand{color:#777}.navbar-default .navbar-brand:focus,.navbar-default .navbar-brand:hover{color:#5e5e5e;background-color:transparent}.navbar-default .navbar-text{color:#777}.navbar-default .navbar-nav>li>a{color:#777}.navbar-default .navbar-nav>li>a:focus,.navbar-default .navbar-nav>li>a:hover{color:#333;background-color:transparent}.navbar-default .navbar-nav>.active>a,.navbar-default .navbar-nav>.active>a:focus,.navbar-default .navbar-nav>.active>a:hover{color:#555;background-color:#e7e7e7}.navbar-default .navbar-nav>.disabled>a,.navbar-default .navbar-nav>.disabled>a:focus,.navbar-default .navbar-nav>.disabled>a:hover{color:#ccc;background-color:transparent}.navbar-default .navbar-nav>.open>a,.navbar-default .navbar-nav>.open>a:focus,.navbar-default .navbar-nav>.open>a:hover{color:#555;background-color:#e7e7e7}@media (max-width:767px){.navbar-default .navbar-nav .open .dropdown-menu>li>a{color:#777}.navbar-default .navbar-nav .open .dropdown-menu>li>a:focus,.navbar-default .navbar-nav .open .dropdown-menu>li>a:hover{color:#333;background-color:transparent}.navbar-default .navbar-nav .open .dropdown-menu>.active>a,.navbar-default .navbar-nav .open .dropdown-menu>.active>a:focus,.navbar-default .navbar-nav .open .dropdown-menu>.active>a:hover{color:#555;background-color:#e7e7e7}.navbar-default .navbar-nav .open .dropdown-menu>.disabled>a,.navbar-default .navbar-nav .open .dropdown-menu>.disabled>a:focus,.navbar-default .navbar-nav .open .dropdown-menu>.disabled>a:hover{color:#ccc;background-color:transparent}}.navbar-default .navbar-toggle{border-color:#ddd}.navbar-default .navbar-toggle:focus,.navbar-default .navbar-toggle:hover{background-color:#ddd}.navbar-default .navbar-toggle .icon-bar{background-color:#888}.navbar-default .navbar-collapse,.navbar-default .navbar-form{border-color:#e7e7e7}.navbar-default .navbar-link{color:#777}.navbar-default .navbar-link:hover{color:#333}.navbar-default .btn-link{color:#777}.navbar-default 
.btn-link:focus,.navbar-default .btn-link:hover{color:#333}.navbar-default .btn-link[disabled]:focus,.navbar-default .btn-link[disabled]:hover,fieldset[disabled] .navbar-default .btn-link:focus,fieldset[disabled] .navbar-default .btn-link:hover{color:#ccc}.navbar-inverse{background-color:#222;border-color:#080808}.navbar-inverse .navbar-brand{color:#9d9d9d}.navbar-inverse .navbar-brand:focus,.navbar-inverse .navbar-brand:hover{color:#fff;background-color:transparent}.navbar-inverse .navbar-text{color:#9d9d9d}.navbar-inverse .navbar-nav>li>a{color:#9d9d9d}.navbar-inverse .navbar-nav>li>a:focus,.navbar-inverse .navbar-nav>li>a:hover{color:#fff;background-color:transparent}.navbar-inverse .navbar-nav>.active>a,.navbar-inverse .navbar-nav>.active>a:focus,.navbar-inverse .navbar-nav>.active>a:hover{color:#fff;background-color:#080808}.navbar-inverse .navbar-nav>.disabled>a,.navbar-inverse .navbar-nav>.disabled>a:focus,.navbar-inverse .navbar-nav>.disabled>a:hover{color:#444;background-color:transparent}.navbar-inverse .navbar-nav>.open>a,.navbar-inverse .navbar-nav>.open>a:focus,.navbar-inverse .navbar-nav>.open>a:hover{color:#fff;background-color:#080808}@media (max-width:767px){.navbar-inverse .navbar-nav .open .dropdown-menu>.dropdown-header{border-color:#080808}.navbar-inverse .navbar-nav .open .dropdown-menu .divider{background-color:#080808}.navbar-inverse .navbar-nav .open .dropdown-menu>li>a{color:#9d9d9d}.navbar-inverse .navbar-nav .open .dropdown-menu>li>a:focus,.navbar-inverse .navbar-nav .open .dropdown-menu>li>a:hover{color:#fff;background-color:transparent}.navbar-inverse .navbar-nav .open .dropdown-menu>.active>a,.navbar-inverse .navbar-nav .open .dropdown-menu>.active>a:focus,.navbar-inverse .navbar-nav .open .dropdown-menu>.active>a:hover{color:#fff;background-color:#080808}.navbar-inverse .navbar-nav .open .dropdown-menu>.disabled>a,.navbar-inverse .navbar-nav .open .dropdown-menu>.disabled>a:focus,.navbar-inverse .navbar-nav .open .dropdown-menu>.disabled>a:hover{color:#444;background-color:transparent}}.navbar-inverse .navbar-toggle{border-color:#333}.navbar-inverse .navbar-toggle:focus,.navbar-inverse .navbar-toggle:hover{background-color:#333}.navbar-inverse .navbar-toggle .icon-bar{background-color:#fff}.navbar-inverse .navbar-collapse,.navbar-inverse .navbar-form{border-color:#101010}.navbar-inverse .navbar-link{color:#9d9d9d}.navbar-inverse .navbar-link:hover{color:#fff}.navbar-inverse .btn-link{color:#9d9d9d}.navbar-inverse .btn-link:focus,.navbar-inverse .btn-link:hover{color:#fff}.navbar-inverse .btn-link[disabled]:focus,.navbar-inverse .btn-link[disabled]:hover,fieldset[disabled] .navbar-inverse .btn-link:focus,fieldset[disabled] .navbar-inverse .btn-link:hover{color:#444}.breadcrumb{padding:8px 15px;margin-bottom:20px;list-style:none;background-color:#f5f5f5;border-radius:4px}.breadcrumb>li{display:inline-block}.breadcrumb>li+li:before{padding:0 5px;color:#ccc;content:"/\00a0"}.breadcrumb>.active{color:#777}.pagination{display:inline-block;padding-left:0;margin:20px 0;border-radius:4px}.pagination>li{display:inline}.pagination>li>a,.pagination>li>span{position:relative;float:left;padding:6px 12px;margin-left:-1px;line-height:1.42857143;color:#337ab7;text-decoration:none;background-color:#fff;border:1px solid 
#ddd}.pagination>li>a:focus,.pagination>li>a:hover,.pagination>li>span:focus,.pagination>li>span:hover{z-index:2;color:#23527c;background-color:#eee;border-color:#ddd}.pagination>li:first-child>a,.pagination>li:first-child>span{margin-left:0;border-top-left-radius:4px;border-bottom-left-radius:4px}.pagination>li:last-child>a,.pagination>li:last-child>span{border-top-right-radius:4px;border-bottom-right-radius:4px}.pagination>.active>a,.pagination>.active>a:focus,.pagination>.active>a:hover,.pagination>.active>span,.pagination>.active>span:focus,.pagination>.active>span:hover{z-index:3;color:#fff;cursor:default;background-color:#337ab7;border-color:#337ab7}.pagination>.disabled>a,.pagination>.disabled>a:focus,.pagination>.disabled>a:hover,.pagination>.disabled>span,.pagination>.disabled>span:focus,.pagination>.disabled>span:hover{color:#777;cursor:not-allowed;background-color:#fff;border-color:#ddd}.pagination-lg>li>a,.pagination-lg>li>span{padding:10px 16px;font-size:18px;line-height:1.3333333}.pagination-lg>li:first-child>a,.pagination-lg>li:first-child>span{border-top-left-radius:6px;border-bottom-left-radius:6px}.pagination-lg>li:last-child>a,.pagination-lg>li:last-child>span{border-top-right-radius:6px;border-bottom-right-radius:6px}.pagination-sm>li>a,.pagination-sm>li>span{padding:5px 10px;font-size:12px;line-height:1.5}.pagination-sm>li:first-child>a,.pagination-sm>li:first-child>span{border-top-left-radius:3px;border-bottom-left-radius:3px}.pagination-sm>li:last-child>a,.pagination-sm>li:last-child>span{border-top-right-radius:3px;border-bottom-right-radius:3px}.pager{padding-left:0;margin:20px 0;text-align:center;list-style:none}.pager li{display:inline}.pager li>a,.pager li>span{display:inline-block;padding:5px 14px;background-color:#fff;border:1px solid #ddd;border-radius:15px}.pager li>a:focus,.pager li>a:hover{text-decoration:none;background-color:#eee}.pager .next>a,.pager .next>span{float:right}.pager .previous>a,.pager .previous>span{float:left}.pager .disabled>a,.pager .disabled>a:focus,.pager .disabled>a:hover,.pager .disabled>span{color:#777;cursor:not-allowed;background-color:#fff}.label{display:inline;padding:.2em .6em .3em;font-size:75%;font-weight:700;line-height:1;color:#fff;text-align:center;white-space:nowrap;vertical-align:baseline;border-radius:.25em}a.label:focus,a.label:hover{color:#fff;text-decoration:none;cursor:pointer}.label:empty{display:none}.btn .label{position:relative;top:-1px}.label-default{background-color:#777}.label-default[href]:focus,.label-default[href]:hover{background-color:#5e5e5e}.label-primary{background-color:#337ab7}.label-primary[href]:focus,.label-primary[href]:hover{background-color:#286090}.label-success{background-color:#5cb85c}.label-success[href]:focus,.label-success[href]:hover{background-color:#449d44}.label-info{background-color:#5bc0de}.label-info[href]:focus,.label-info[href]:hover{background-color:#31b0d5}.label-warning{background-color:#f0ad4e}.label-warning[href]:focus,.label-warning[href]:hover{background-color:#ec971f}.label-danger{background-color:#d9534f}.label-danger[href]:focus,.label-danger[href]:hover{background-color:#c9302c}.badge{display:inline-block;min-width:10px;padding:3px 7px;font-size:12px;font-weight:700;line-height:1;color:#fff;text-align:center;white-space:nowrap;vertical-align:middle;background-color:#777;border-radius:10px}.badge:empty{display:none}.btn .badge{position:relative;top:-1px}.btn-group-xs>.btn .badge,.btn-xs .badge{top:0;padding:1px 
5px}a.badge:focus,a.badge:hover{color:#fff;text-decoration:none;cursor:pointer}.list-group-item.active>.badge,.nav-pills>.active>a>.badge{color:#337ab7;background-color:#fff}.list-group-item>.badge{float:right}.list-group-item>.badge+.badge{margin-right:5px}.nav-pills>li>a>.badge{margin-left:3px}.jumbotron{padding-top:30px;padding-bottom:30px;margin-bottom:30px;color:inherit;background-color:#eee}.jumbotron .h1,.jumbotron h1{color:inherit}.jumbotron p{margin-bottom:15px;font-size:21px;font-weight:200}.jumbotron>hr{border-top-color:#d5d5d5}.container .jumbotron,.container-fluid .jumbotron{padding-right:15px;padding-left:15px;border-radius:6px}.jumbotron .container{max-width:100%}@media screen and (min-width:768px){.jumbotron{padding-top:48px;padding-bottom:48px}.container .jumbotron,.container-fluid .jumbotron{padding-right:60px;padding-left:60px}.jumbotron .h1,.jumbotron h1{font-size:63px}}.thumbnail{display:block;padding:4px;margin-bottom:20px;line-height:1.42857143;background-color:#fff;border:1px solid #ddd;border-radius:4px;-webkit-transition:border .2s ease-in-out;-o-transition:border .2s ease-in-out;transition:border .2s ease-in-out}.thumbnail a>img,.thumbnail>img{margin-right:auto;margin-left:auto}a.thumbnail.active,a.thumbnail:focus,a.thumbnail:hover{border-color:#337ab7}.thumbnail .caption{padding:9px;color:#333}.alert{padding:15px;margin-bottom:20px;border:1px solid transparent;border-radius:4px}.alert h4{margin-top:0;color:inherit}.alert .alert-link{font-weight:700}.alert>p,.alert>ul{margin-bottom:0}.alert>p+p{margin-top:5px}.alert-dismissable,.alert-dismissible{padding-right:35px}.alert-dismissable .close,.alert-dismissible .close{position:relative;top:-2px;right:-21px;color:inherit}.alert-success{color:#3c763d;background-color:#dff0d8;border-color:#d6e9c6}.alert-success hr{border-top-color:#c9e2b3}.alert-success .alert-link{color:#2b542c}.alert-info{color:#31708f;background-color:#d9edf7;border-color:#bce8f1}.alert-info hr{border-top-color:#a6e1ec}.alert-info .alert-link{color:#245269}.alert-warning{color:#8a6d3b;background-color:#fcf8e3;border-color:#faebcc}.alert-warning hr{border-top-color:#f7e1b5}.alert-warning .alert-link{color:#66512c}.alert-danger{color:#a94442;background-color:#f2dede;border-color:#ebccd1}.alert-danger hr{border-top-color:#e4b9c0}.alert-danger .alert-link{color:#843534}@-webkit-keyframes progress-bar-stripes{from{background-position:40px 0}to{background-position:0 0}}@-o-keyframes progress-bar-stripes{from{background-position:40px 0}to{background-position:0 0}}@keyframes progress-bar-stripes{from{background-position:40px 0}to{background-position:0 0}}.progress{height:20px;margin-bottom:20px;overflow:hidden;background-color:#f5f5f5;border-radius:4px;-webkit-box-shadow:inset 0 1px 2px rgba(0,0,0,.1);box-shadow:inset 0 1px 2px rgba(0,0,0,.1)}.progress-bar{float:left;width:0%;height:100%;font-size:12px;line-height:20px;color:#fff;text-align:center;background-color:#337ab7;-webkit-box-shadow:inset 0 -1px 0 rgba(0,0,0,.15);box-shadow:inset 0 -1px 0 rgba(0,0,0,.15);-webkit-transition:width .6s ease;-o-transition:width .6s ease;transition:width .6s ease}.progress-bar-striped,.progress-striped .progress-bar{background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 
75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);-webkit-background-size:40px 40px;background-size:40px 40px}.progress-bar.active,.progress.active .progress-bar{-webkit-animation:progress-bar-stripes 2s linear infinite;-o-animation:progress-bar-stripes 2s linear infinite;animation:progress-bar-stripes 2s linear infinite}.progress-bar-success{background-color:#5cb85c}.progress-striped .progress-bar-success{background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent)}.progress-bar-info{background-color:#5bc0de}.progress-striped .progress-bar-info{background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent)}.progress-bar-warning{background-color:#f0ad4e}.progress-striped .progress-bar-warning{background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent)}.progress-bar-danger{background-color:#d9534f}.progress-striped .progress-bar-danger{background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 
75%,transparent)}.media{margin-top:15px}.media:first-child{margin-top:0}.media,.media-body{overflow:hidden;zoom:1}.media-body{width:10000px}.media-object{display:block}.media-object.img-thumbnail{max-width:none}.media-right,.media>.pull-right{padding-left:10px}.media-left,.media>.pull-left{padding-right:10px}.media-body,.media-left,.media-right{display:table-cell;vertical-align:top}.media-middle{vertical-align:middle}.media-bottom{vertical-align:bottom}.media-heading{margin-top:0;margin-bottom:5px}.media-list{padding-left:0;list-style:none}.list-group{padding-left:0;margin-bottom:20px}.list-group-item{position:relative;display:block;padding:10px 15px;margin-bottom:-1px;background-color:#fff;border:1px solid #ddd}.list-group-item:first-child{border-top-left-radius:4px;border-top-right-radius:4px}.list-group-item:last-child{margin-bottom:0;border-bottom-right-radius:4px;border-bottom-left-radius:4px}.list-group-item.disabled,.list-group-item.disabled:focus,.list-group-item.disabled:hover{color:#777;cursor:not-allowed;background-color:#eee}.list-group-item.disabled .list-group-item-heading,.list-group-item.disabled:focus .list-group-item-heading,.list-group-item.disabled:hover .list-group-item-heading{color:inherit}.list-group-item.disabled .list-group-item-text,.list-group-item.disabled:focus .list-group-item-text,.list-group-item.disabled:hover .list-group-item-text{color:#777}.list-group-item.active,.list-group-item.active:focus,.list-group-item.active:hover{z-index:2;color:#fff;background-color:#337ab7;border-color:#337ab7}.list-group-item.active .list-group-item-heading,.list-group-item.active .list-group-item-heading>.small,.list-group-item.active .list-group-item-heading>small,.list-group-item.active:focus .list-group-item-heading,.list-group-item.active:focus .list-group-item-heading>.small,.list-group-item.active:focus .list-group-item-heading>small,.list-group-item.active:hover .list-group-item-heading,.list-group-item.active:hover .list-group-item-heading>.small,.list-group-item.active:hover .list-group-item-heading>small{color:inherit}.list-group-item.active .list-group-item-text,.list-group-item.active:focus .list-group-item-text,.list-group-item.active:hover .list-group-item-text{color:#c7ddef}a.list-group-item,button.list-group-item{color:#555}a.list-group-item .list-group-item-heading,button.list-group-item .list-group-item-heading{color:#333}a.list-group-item:focus,a.list-group-item:hover,button.list-group-item:focus,button.list-group-item:hover{color:#555;text-decoration:none;background-color:#f5f5f5}button.list-group-item{width:100%;text-align:left}.list-group-item-success{color:#3c763d;background-color:#dff0d8}a.list-group-item-success,button.list-group-item-success{color:#3c763d}a.list-group-item-success .list-group-item-heading,button.list-group-item-success .list-group-item-heading{color:inherit}a.list-group-item-success:focus,a.list-group-item-success:hover,button.list-group-item-success:focus,button.list-group-item-success:hover{color:#3c763d;background-color:#d0e9c6}a.list-group-item-success.active,a.list-group-item-success.active:focus,a.list-group-item-success.active:hover,button.list-group-item-success.active,button.list-group-item-success.active:focus,button.list-group-item-success.active:hover{color:#fff;background-color:#3c763d;border-color:#3c763d}.list-group-item-info{color:#31708f;background-color:#d9edf7}a.list-group-item-info,button.list-group-item-info{color:#31708f}a.list-group-item-info .list-group-item-heading,button.list-group-item-info 
.list-group-item-heading{color:inherit}a.list-group-item-info:focus,a.list-group-item-info:hover,button.list-group-item-info:focus,button.list-group-item-info:hover{color:#31708f;background-color:#c4e3f3}a.list-group-item-info.active,a.list-group-item-info.active:focus,a.list-group-item-info.active:hover,button.list-group-item-info.active,button.list-group-item-info.active:focus,button.list-group-item-info.active:hover{color:#fff;background-color:#31708f;border-color:#31708f}.list-group-item-warning{color:#8a6d3b;background-color:#fcf8e3}a.list-group-item-warning,button.list-group-item-warning{color:#8a6d3b}a.list-group-item-warning .list-group-item-heading,button.list-group-item-warning .list-group-item-heading{color:inherit}a.list-group-item-warning:focus,a.list-group-item-warning:hover,button.list-group-item-warning:focus,button.list-group-item-warning:hover{color:#8a6d3b;background-color:#faf2cc}a.list-group-item-warning.active,a.list-group-item-warning.active:focus,a.list-group-item-warning.active:hover,button.list-group-item-warning.active,button.list-group-item-warning.active:focus,button.list-group-item-warning.active:hover{color:#fff;background-color:#8a6d3b;border-color:#8a6d3b}.list-group-item-danger{color:#a94442;background-color:#f2dede}a.list-group-item-danger,button.list-group-item-danger{color:#a94442}a.list-group-item-danger .list-group-item-heading,button.list-group-item-danger .list-group-item-heading{color:inherit}a.list-group-item-danger:focus,a.list-group-item-danger:hover,button.list-group-item-danger:focus,button.list-group-item-danger:hover{color:#a94442;background-color:#ebcccc}a.list-group-item-danger.active,a.list-group-item-danger.active:focus,a.list-group-item-danger.active:hover,button.list-group-item-danger.active,button.list-group-item-danger.active:focus,button.list-group-item-danger.active:hover{color:#fff;background-color:#a94442;border-color:#a94442}.list-group-item-heading{margin-top:0;margin-bottom:5px}.list-group-item-text{margin-bottom:0;line-height:1.3}.panel{margin-bottom:20px;background-color:#fff;border:1px solid transparent;border-radius:4px;-webkit-box-shadow:0 1px 1px rgba(0,0,0,.05);box-shadow:0 1px 1px rgba(0,0,0,.05)}.panel-body{padding:15px}.panel-heading{padding:10px 15px;border-bottom:1px solid transparent;border-top-left-radius:3px;border-top-right-radius:3px}.panel-heading>.dropdown .dropdown-toggle{color:inherit}.panel-title{margin-top:0;margin-bottom:0;font-size:16px;color:inherit}.panel-title>.small,.panel-title>.small>a,.panel-title>a,.panel-title>small,.panel-title>small>a{color:inherit}.panel-footer{padding:10px 15px;background-color:#f5f5f5;border-top:1px solid #ddd;border-bottom-right-radius:3px;border-bottom-left-radius:3px}.panel>.list-group,.panel>.panel-collapse>.list-group{margin-bottom:0}.panel>.list-group .list-group-item,.panel>.panel-collapse>.list-group .list-group-item{border-width:1px 0;border-radius:0}.panel>.list-group:first-child .list-group-item:first-child,.panel>.panel-collapse>.list-group:first-child .list-group-item:first-child{border-top:0;border-top-left-radius:3px;border-top-right-radius:3px}.panel>.list-group:last-child .list-group-item:last-child,.panel>.panel-collapse>.list-group:last-child .list-group-item:last-child{border-bottom:0;border-bottom-right-radius:3px;border-bottom-left-radius:3px}.panel>.panel-heading+.panel-collapse>.list-group .list-group-item:first-child{border-top-left-radius:0;border-top-right-radius:0}.panel-heading+.list-group 
.list-group-item:first-child{border-top-width:0}.list-group+.panel-footer{border-top-width:0}.panel>.panel-collapse>.table,.panel>.table,.panel>.table-responsive>.table{margin-bottom:0}.panel>.panel-collapse>.table caption,.panel>.table caption,.panel>.table-responsive>.table caption{padding-right:15px;padding-left:15px}.panel>.table-responsive:first-child>.table:first-child,.panel>.table:first-child{border-top-left-radius:3px;border-top-right-radius:3px}.panel>.table-responsive:first-child>.table:first-child>tbody:first-child>tr:first-child,.panel>.table-responsive:first-child>.table:first-child>thead:first-child>tr:first-child,.panel>.table:first-child>tbody:first-child>tr:first-child,.panel>.table:first-child>thead:first-child>tr:first-child{border-top-left-radius:3px;border-top-right-radius:3px}.panel>.table-responsive:first-child>.table:first-child>tbody:first-child>tr:first-child td:first-child,.panel>.table-responsive:first-child>.table:first-child>tbody:first-child>tr:first-child th:first-child,.panel>.table-responsive:first-child>.table:first-child>thead:first-child>tr:first-child td:first-child,.panel>.table-responsive:first-child>.table:first-child>thead:first-child>tr:first-child th:first-child,.panel>.table:first-child>tbody:first-child>tr:first-child td:first-child,.panel>.table:first-child>tbody:first-child>tr:first-child th:first-child,.panel>.table:first-child>thead:first-child>tr:first-child td:first-child,.panel>.table:first-child>thead:first-child>tr:first-child th:first-child{border-top-left-radius:3px}.panel>.table-responsive:first-child>.table:first-child>tbody:first-child>tr:first-child td:last-child,.panel>.table-responsive:first-child>.table:first-child>tbody:first-child>tr:first-child th:last-child,.panel>.table-responsive:first-child>.table:first-child>thead:first-child>tr:first-child td:last-child,.panel>.table-responsive:first-child>.table:first-child>thead:first-child>tr:first-child th:last-child,.panel>.table:first-child>tbody:first-child>tr:first-child td:last-child,.panel>.table:first-child>tbody:first-child>tr:first-child th:last-child,.panel>.table:first-child>thead:first-child>tr:first-child td:last-child,.panel>.table:first-child>thead:first-child>tr:first-child th:last-child{border-top-right-radius:3px}.panel>.table-responsive:last-child>.table:last-child,.panel>.table:last-child{border-bottom-right-radius:3px;border-bottom-left-radius:3px}.panel>.table-responsive:last-child>.table:last-child>tbody:last-child>tr:last-child,.panel>.table-responsive:last-child>.table:last-child>tfoot:last-child>tr:last-child,.panel>.table:last-child>tbody:last-child>tr:last-child,.panel>.table:last-child>tfoot:last-child>tr:last-child{border-bottom-right-radius:3px;border-bottom-left-radius:3px}.panel>.table-responsive:last-child>.table:last-child>tbody:last-child>tr:last-child td:first-child,.panel>.table-responsive:last-child>.table:last-child>tbody:last-child>tr:last-child th:first-child,.panel>.table-responsive:last-child>.table:last-child>tfoot:last-child>tr:last-child td:first-child,.panel>.table-responsive:last-child>.table:last-child>tfoot:last-child>tr:last-child th:first-child,.panel>.table:last-child>tbody:last-child>tr:last-child td:first-child,.panel>.table:last-child>tbody:last-child>tr:last-child th:first-child,.panel>.table:last-child>tfoot:last-child>tr:last-child td:first-child,.panel>.table:last-child>tfoot:last-child>tr:last-child 
th:first-child{border-bottom-left-radius:3px}.panel>.table-responsive:last-child>.table:last-child>tbody:last-child>tr:last-child td:last-child,.panel>.table-responsive:last-child>.table:last-child>tbody:last-child>tr:last-child th:last-child,.panel>.table-responsive:last-child>.table:last-child>tfoot:last-child>tr:last-child td:last-child,.panel>.table-responsive:last-child>.table:last-child>tfoot:last-child>tr:last-child th:last-child,.panel>.table:last-child>tbody:last-child>tr:last-child td:last-child,.panel>.table:last-child>tbody:last-child>tr:last-child th:last-child,.panel>.table:last-child>tfoot:last-child>tr:last-child td:last-child,.panel>.table:last-child>tfoot:last-child>tr:last-child th:last-child{border-bottom-right-radius:3px}.panel>.panel-body+.table,.panel>.panel-body+.table-responsive,.panel>.table+.panel-body,.panel>.table-responsive+.panel-body{border-top:1px solid #ddd}.panel>.table>tbody:first-child>tr:first-child td,.panel>.table>tbody:first-child>tr:first-child th{border-top:0}.panel>.table-bordered,.panel>.table-responsive>.table-bordered{border:0}.panel>.table-bordered>tbody>tr>td:first-child,.panel>.table-bordered>tbody>tr>th:first-child,.panel>.table-bordered>tfoot>tr>td:first-child,.panel>.table-bordered>tfoot>tr>th:first-child,.panel>.table-bordered>thead>tr>td:first-child,.panel>.table-bordered>thead>tr>th:first-child,.panel>.table-responsive>.table-bordered>tbody>tr>td:first-child,.panel>.table-responsive>.table-bordered>tbody>tr>th:first-child,.panel>.table-responsive>.table-bordered>tfoot>tr>td:first-child,.panel>.table-responsive>.table-bordered>tfoot>tr>th:first-child,.panel>.table-responsive>.table-bordered>thead>tr>td:first-child,.panel>.table-responsive>.table-bordered>thead>tr>th:first-child{border-left:0}.panel>.table-bordered>tbody>tr>td:last-child,.panel>.table-bordered>tbody>tr>th:last-child,.panel>.table-bordered>tfoot>tr>td:last-child,.panel>.table-bordered>tfoot>tr>th:last-child,.panel>.table-bordered>thead>tr>td:last-child,.panel>.table-bordered>thead>tr>th:last-child,.panel>.table-responsive>.table-bordered>tbody>tr>td:last-child,.panel>.table-responsive>.table-bordered>tbody>tr>th:last-child,.panel>.table-responsive>.table-bordered>tfoot>tr>td:last-child,.panel>.table-responsive>.table-bordered>tfoot>tr>th:last-child,.panel>.table-responsive>.table-bordered>thead>tr>td:last-child,.panel>.table-responsive>.table-bordered>thead>tr>th:last-child{border-right:0}.panel>.table-bordered>tbody>tr:first-child>td,.panel>.table-bordered>tbody>tr:first-child>th,.panel>.table-bordered>thead>tr:first-child>td,.panel>.table-bordered>thead>tr:first-child>th,.panel>.table-responsive>.table-bordered>tbody>tr:first-child>td,.panel>.table-responsive>.table-bordered>tbody>tr:first-child>th,.panel>.table-responsive>.table-bordered>thead>tr:first-child>td,.panel>.table-responsive>.table-bordered>thead>tr:first-child>th{border-bottom:0}.panel>.table-bordered>tbody>tr:last-child>td,.panel>.table-bordered>tbody>tr:last-child>th,.panel>.table-bordered>tfoot>tr:last-child>td,.panel>.table-bordered>tfoot>tr:last-child>th,.panel>.table-responsive>.table-bordered>tbody>tr:last-child>td,.panel>.table-responsive>.table-bordered>tbody>tr:last-child>th,.panel>.table-responsive>.table-bordered>tfoot>tr:last-child>td,.panel>.table-responsive>.table-bordered>tfoot>tr:last-child>th{border-bottom:0}.panel>.table-responsive{margin-bottom:0;border:0}.panel-group{margin-bottom:20px}.panel-group .panel{margin-bottom:0;border-radius:4px}.panel-group 
.panel+.panel{margin-top:5px}.panel-group .panel-heading{border-bottom:0}.panel-group .panel-heading+.panel-collapse>.list-group,.panel-group .panel-heading+.panel-collapse>.panel-body{border-top:1px solid #ddd}.panel-group .panel-footer{border-top:0}.panel-group .panel-footer+.panel-collapse .panel-body{border-bottom:1px solid #ddd}.panel-default{border-color:#ddd}.panel-default>.panel-heading{color:#333;background-color:#f5f5f5;border-color:#ddd}.panel-default>.panel-heading+.panel-collapse>.panel-body{border-top-color:#ddd}.panel-default>.panel-heading .badge{color:#f5f5f5;background-color:#333}.panel-default>.panel-footer+.panel-collapse>.panel-body{border-bottom-color:#ddd}.panel-primary{border-color:#337ab7}.panel-primary>.panel-heading{color:#fff;background-color:#337ab7;border-color:#337ab7}.panel-primary>.panel-heading+.panel-collapse>.panel-body{border-top-color:#337ab7}.panel-primary>.panel-heading .badge{color:#337ab7;background-color:#fff}.panel-primary>.panel-footer+.panel-collapse>.panel-body{border-bottom-color:#337ab7}.panel-success{border-color:#d6e9c6}.panel-success>.panel-heading{color:#3c763d;background-color:#dff0d8;border-color:#d6e9c6}.panel-success>.panel-heading+.panel-collapse>.panel-body{border-top-color:#d6e9c6}.panel-success>.panel-heading .badge{color:#dff0d8;background-color:#3c763d}.panel-success>.panel-footer+.panel-collapse>.panel-body{border-bottom-color:#d6e9c6}.panel-info{border-color:#bce8f1}.panel-info>.panel-heading{color:#31708f;background-color:#d9edf7;border-color:#bce8f1}.panel-info>.panel-heading+.panel-collapse>.panel-body{border-top-color:#bce8f1}.panel-info>.panel-heading .badge{color:#d9edf7;background-color:#31708f}.panel-info>.panel-footer+.panel-collapse>.panel-body{border-bottom-color:#bce8f1}.panel-warning{border-color:#faebcc}.panel-warning>.panel-heading{color:#8a6d3b;background-color:#fcf8e3;border-color:#faebcc}.panel-warning>.panel-heading+.panel-collapse>.panel-body{border-top-color:#faebcc}.panel-warning>.panel-heading .badge{color:#fcf8e3;background-color:#8a6d3b}.panel-warning>.panel-footer+.panel-collapse>.panel-body{border-bottom-color:#faebcc}.panel-danger{border-color:#ebccd1}.panel-danger>.panel-heading{color:#a94442;background-color:#f2dede;border-color:#ebccd1}.panel-danger>.panel-heading+.panel-collapse>.panel-body{border-top-color:#ebccd1}.panel-danger>.panel-heading .badge{color:#f2dede;background-color:#a94442}.panel-danger>.panel-footer+.panel-collapse>.panel-body{border-bottom-color:#ebccd1}.embed-responsive{position:relative;display:block;height:0;padding:0;overflow:hidden}.embed-responsive .embed-responsive-item,.embed-responsive embed,.embed-responsive iframe,.embed-responsive object,.embed-responsive video{position:absolute;top:0;bottom:0;left:0;width:100%;height:100%;border:0}.embed-responsive-16by9{padding-bottom:56.25%}.embed-responsive-4by3{padding-bottom:75%}.well{min-height:20px;padding:19px;margin-bottom:20px;background-color:#f5f5f5;border:1px solid #e3e3e3;border-radius:4px;-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,.05);box-shadow:inset 0 1px 1px rgba(0,0,0,.05)}.well blockquote{border-color:#ddd;border-color:rgba(0,0,0,.15)}.well-lg{padding:24px;border-radius:6px}.well-sm{padding:9px;border-radius:3px}.close{float:right;font-size:21px;font-weight:700;line-height:1;color:#000;text-shadow:0 1px 0 #fff;filter:alpha(opacity=20);opacity:.2}.close:focus,.close:hover{color:#000;text-decoration:none;cursor:pointer;filter:alpha(opacity=50);opacity:.5}button.close{padding:0;cursor:pointer;background:0 
0;border:0;-webkit-appearance:none;-moz-appearance:none;appearance:none}.modal-open{overflow:hidden}.modal{position:fixed;top:0;right:0;bottom:0;left:0;z-index:1050;display:none;overflow:hidden;-webkit-overflow-scrolling:touch;outline:0}.modal.fade .modal-dialog{-webkit-transform:translate(0,-25%);-ms-transform:translate(0,-25%);-o-transform:translate(0,-25%);transform:translate(0,-25%);-webkit-transition:-webkit-transform .3s ease-out;-o-transition:-o-transform .3s ease-out;transition:-webkit-transform .3s ease-out;transition:transform .3s ease-out;transition:transform .3s ease-out,-webkit-transform .3s ease-out,-o-transform .3s ease-out}.modal.in .modal-dialog{-webkit-transform:translate(0,0);-ms-transform:translate(0,0);-o-transform:translate(0,0);transform:translate(0,0)}.modal-open .modal{overflow-x:hidden;overflow-y:auto}.modal-dialog{position:relative;width:auto;margin:10px}.modal-content{position:relative;background-color:#fff;background-clip:padding-box;border:1px solid #999;border:1px solid rgba(0,0,0,.2);border-radius:6px;-webkit-box-shadow:0 3px 9px rgba(0,0,0,.5);box-shadow:0 3px 9px rgba(0,0,0,.5);outline:0}.modal-backdrop{position:fixed;top:0;right:0;bottom:0;left:0;z-index:1040;background-color:#000}.modal-backdrop.fade{filter:alpha(opacity=0);opacity:0}.modal-backdrop.in{filter:alpha(opacity=50);opacity:.5}.modal-header{padding:15px;border-bottom:1px solid #e5e5e5}.modal-header .close{margin-top:-2px}.modal-title{margin:0;line-height:1.42857143}.modal-body{position:relative;padding:15px}.modal-footer{padding:15px;text-align:right;border-top:1px solid #e5e5e5}.modal-footer .btn+.btn{margin-bottom:0;margin-left:5px}.modal-footer .btn-group .btn+.btn{margin-left:-1px}.modal-footer .btn-block+.btn-block{margin-left:0}.modal-scrollbar-measure{position:absolute;top:-9999px;width:50px;height:50px;overflow:scroll}@media (min-width:768px){.modal-dialog{width:600px;margin:30px auto}.modal-content{-webkit-box-shadow:0 5px 15px rgba(0,0,0,.5);box-shadow:0 5px 15px rgba(0,0,0,.5)}.modal-sm{width:300px}}@media (min-width:992px){.modal-lg{width:900px}}.tooltip{position:absolute;z-index:1070;display:block;font-family:"Helvetica Neue",Helvetica,Arial,sans-serif;font-style:normal;font-weight:400;line-height:1.42857143;line-break:auto;text-align:left;text-align:start;text-decoration:none;text-shadow:none;text-transform:none;letter-spacing:normal;word-break:normal;word-spacing:normal;word-wrap:normal;white-space:normal;font-size:12px;filter:alpha(opacity=0);opacity:0}.tooltip.in{filter:alpha(opacity=90);opacity:.9}.tooltip.top{padding:5px 0;margin-top:-3px}.tooltip.right{padding:0 5px;margin-left:3px}.tooltip.bottom{padding:5px 0;margin-top:3px}.tooltip.left{padding:0 5px;margin-left:-3px}.tooltip.top .tooltip-arrow{bottom:0;left:50%;margin-left:-5px;border-width:5px 5px 0;border-top-color:#000}.tooltip.top-left .tooltip-arrow{right:5px;bottom:0;margin-bottom:-5px;border-width:5px 5px 0;border-top-color:#000}.tooltip.top-right .tooltip-arrow{bottom:0;left:5px;margin-bottom:-5px;border-width:5px 5px 0;border-top-color:#000}.tooltip.right .tooltip-arrow{top:50%;left:0;margin-top:-5px;border-width:5px 5px 5px 0;border-right-color:#000}.tooltip.left .tooltip-arrow{top:50%;right:0;margin-top:-5px;border-width:5px 0 5px 5px;border-left-color:#000}.tooltip.bottom .tooltip-arrow{top:0;left:50%;margin-left:-5px;border-width:0 5px 5px;border-bottom-color:#000}.tooltip.bottom-left .tooltip-arrow{top:0;right:5px;margin-top:-5px;border-width:0 5px 5px;border-bottom-color:#000}.tooltip.bottom-right 
[Remainder of the embedded web-UI hunk omitted: it adds the bundled web client (loaded later in koboldcpp.py as embedded_kailite) as one large block of minified Bootstrap CSS (tooltips, popovers, carousel, responsive visibility helpers) plus the page's HTML, whose markup was lost in extraction; only stray "+" markers and text fragments such as "Connecting..." and "Avoid sending privacy sensitive information. Click here for more info." survive.]
diff --git a/koboldcpp.py b/koboldcpp.py index ab3ce85ec959e971f510dc9a51eebab335a4471b..67fa9544142c08ff267069c2c5d0389bdf780583 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -61,8 +61,10 @@ class generation_inputs(ctypes.Structure): ("mirostat_eta", ctypes.c_float), ("sampler_order", ctypes.c_int * sampler_order_max), ("sampler_len", ctypes.c_int), + ("unban_tokens_rt", ctypes.c_bool), ("stop_sequence", ctypes.c_char_p * stop_token_max), - ("stream_sse", ctypes.c_bool)] + ("stream_sse", ctypes.c_bool), + ("grammar", ctypes.c_char_p)] class generation_outputs(ctypes.Structure): _fields_ = [("status", ctypes.c_int), @@ -72,37 +74,48 @@ handle = None def getdirpath(): return os.path.dirname(os.path.realpath(__file__)) +def getabspath(): + return os.path.dirname(os.path.abspath(__file__)) def file_exists(filename): return os.path.exists(os.path.join(getdirpath(), filename)) def pick_existant_file(ntoption,nonntoption): + precompiled_prefix = "precompiled_" ntexist = file_exists(ntoption) nonntexist = file_exists(nonntoption) + precompiled_ntexist = file_exists(precompiled_prefix+ntoption) + precompiled_nonntexist = file_exists(precompiled_prefix+nonntoption) if os.name == 'nt': + if not ntexist and precompiled_ntexist: + return (precompiled_prefix+ntoption) if nonntexist and not ntexist: return nonntoption return ntoption else: + if not nonntexist and precompiled_nonntexist: + return (precompiled_prefix+nonntoption) if ntexist and not nonntexist: return ntoption return nonntoption -lib_default = pick_existant_file("koboldcpp.dll","koboldcpp.so") +lib_default = pick_existant_file("koboldcpp_default.dll","koboldcpp_default.so") lib_failsafe = pick_existant_file("koboldcpp_failsafe.dll","koboldcpp_failsafe.so") lib_openblas = pick_existant_file("koboldcpp_openblas.dll","koboldcpp_openblas.so") lib_noavx2 = pick_existant_file("koboldcpp_noavx2.dll","koboldcpp_noavx2.so") lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so") lib_cublas = pick_existant_file("koboldcpp_cublas.dll","koboldcpp_cublas.so") +lib_hipblas = pick_existant_file("koboldcpp_hipblas.dll","koboldcpp_hipblas.so") def init_library(): - global handle + global handle, args global lib_default,lib_failsafe,lib_openblas,lib_noavx2,lib_clblast,lib_cublas libname = "" use_openblas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir. use_clblast = False #uses CLBlast instead use_cublas = False #uses cublas instead + use_hipblas = False #uses hipblas instead use_noavx2 = False #uses no avx2 instructions use_failsafe = False #uses no intrinsics, failsafe mode if args.noavx2: @@ -121,11 +134,16 @@ def init_library(): print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.") use_clblast = True elif (args.usecublas is not None): - if not file_exists(lib_cublas): + if not file_exists(lib_cublas) and not file_exists(lib_hipblas): print("Warning: CuBLAS library file not found. Non-BLAS library will be used.") else: - print("Attempting to use CuBLAS library for faster prompt ingestion. A compatible CuBLAS will be required.") - use_cublas = True + if file_exists(lib_cublas): + print("Attempting to use CuBLAS library for faster prompt ingestion. A compatible CuBLAS will be required.") + use_cublas = True + elif file_exists(lib_hipblas): + print("Attempting to use hipBLAS library for faster prompt ingestion.
A compatible AMD GPU will be required.") + use_hipblas = True + else: if not file_exists(lib_openblas) or (os.name=='nt' and not file_exists("libopenblas.dll")): print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.") @@ -147,6 +165,8 @@ def init_library(): libname = lib_clblast elif use_cublas: libname = lib_cublas + elif use_hipblas: + libname = lib_hipblas elif use_openblas: libname = lib_openblas else: @@ -154,8 +174,13 @@ def init_library(): print("Initializing dynamic library: " + libname) dir_path = getdirpath() + abs_path = getabspath() - #OpenBLAS should provide about a 2x speedup on prompt ingestion if compatible. + #add all potential paths + if os.name=='nt': + os.add_dll_directory(dir_path) + os.add_dll_directory(abs_path) + os.add_dll_directory(os.getcwd()) handle = ctypes.CDLL(os.path.join(dir_path, libname)) handle.load_model.argtypes = [load_model_inputs] @@ -171,9 +196,11 @@ def init_library(): handle.get_last_token_count.restype = ctypes.c_int handle.get_last_stop_reason.restype = ctypes.c_int handle.abort_generate.restype = ctypes.c_bool + handle.token_count.restype = ctypes.c_int handle.get_pending_output.restype = ctypes.c_char_p def load_model(model_filename): + global args inputs = load_model_inputs() inputs.model_filename = model_filename.encode("UTF-8") inputs.batch_size = 8 @@ -206,13 +233,6 @@ def load_model(model_filename): if args.useclblast: clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1]) inputs.clblast_info = clblastids - inputs.cublas_info = 0 - if (args.usecublas and "0" in args.usecublas): - os.environ["CUDA_VISIBLE_DEVICES"] = "0" - elif (args.usecublas and "1" in args.usecublas): - os.environ["CUDA_VISIBLE_DEVICES"] = "1" - elif (args.usecublas and "2" in args.usecublas): - os.environ["CUDA_VISIBLE_DEVICES"] = "2" for n in range(tensor_split_max): if args.tensor_split and n < len(args.tensor_split): @@ -220,6 +240,33 @@ def load_model(model_filename): else: inputs.tensor_split[n] = 0 + # we must force an explicit tensor split + # otherwise the default will divide equally and multigpu crap will slow it down badly + inputs.cublas_info = 0 + + if not args.tensor_split: + if (args.usecublas and "0" in args.usecublas): + os.environ["CUDA_VISIBLE_DEVICES"] = "0" + os.environ["HIP_VISIBLE_DEVICES"] = "0" + elif (args.usecublas and "1" in args.usecublas): + os.environ["CUDA_VISIBLE_DEVICES"] = "1" + os.environ["HIP_VISIBLE_DEVICES"] = "1" + elif (args.usecublas and "2" in args.usecublas): + os.environ["CUDA_VISIBLE_DEVICES"] = "2" + os.environ["HIP_VISIBLE_DEVICES"] = "2" + elif (args.usecublas and "3" in args.usecublas): + os.environ["CUDA_VISIBLE_DEVICES"] = "3" + os.environ["HIP_VISIBLE_DEVICES"] = "3" + else: + if (args.usecublas and "0" in args.usecublas): + inputs.cublas_info = 0 + elif (args.usecublas and "1" in args.usecublas): + inputs.cublas_info = 1 + elif (args.usecublas and "2" in args.usecublas): + inputs.cublas_info = 2 + elif (args.usecublas and "3" in args.usecublas): + inputs.cublas_info = 3 + inputs.executable_path = (getdirpath()+"/").encode("UTF-8") inputs.debugmode = args.debugmode banned_tokens = args.bantokens @@ -231,8 +278,8 @@ def load_model(model_filename): ret = handle.load_model(inputs) return ret -def generate(prompt,max_length=20, max_context_length=512, temperature=0.8, top_k=120, top_a=0.0, top_p=0.85, typical_p=1.0, tfs=1.0, rep_pen=1.1, rep_pen_range=128, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], stream_sse=False): 
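The load_model() hunk above picks the main GPU in two deliberately different ways: when no --tensor_split is given, it hides every other device through CUDA_VISIBLE_DEVICES/HIP_VISIBLE_DEVICES before the backend library initializes, so single-GPU runs never pay the multi-GPU synchronization cost the comment warns about; when an explicit split is given, all devices must stay visible, so the chosen index is handed to the backend via cublas_info instead. A minimal standalone sketch of that selection rule, assuming the same "--usecublas [lowvram|normal] [GPU id] [mmq]" argument shape (select_main_gpu is an illustrative helper, not part of koboldcpp):

    import os

    def select_main_gpu(usecublas_args, tensor_split):
        # Find the first explicit device id ("0".."3") passed to --usecublas.
        chosen = next((d for d in ("0", "1", "2", "3")
                       if usecublas_args and d in usecublas_args), None)
        if chosen is None:
            return 0  # no specific GPU requested: default device / all GPUs
        if not tensor_split:
            # No explicit split: restrict visibility for both CUDA and ROCm
            # *before* the compute library loads, so it sees a single GPU.
            os.environ["CUDA_VISIBLE_DEVICES"] = chosen
            os.environ["HIP_VISIBLE_DEVICES"] = chosen
            return 0  # the only visible device is now index 0
        # With a tensor split every GPU must stay visible; report the main
        # device index (cublas_info) to the backend instead.
        return int(chosen)

    # e.g. select_main_gpu(["lowvram", "1"], tensor_split=None)
    # exports CUDA_VISIBLE_DEVICES=1 / HIP_VISIBLE_DEVICES=1 and returns 0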
- global maxctx +def generate(prompt,max_length=20, max_context_length=512, temperature=0.8, top_k=120, top_a=0.0, top_p=0.85, typical_p=1.0, tfs=1.0, rep_pen=1.1, rep_pen_range=128, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=True, stream_sse=False, grammar=''): + global maxctx, args inputs = generation_inputs() outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs)) inputs.prompt = prompt.encode("UTF-8") @@ -253,6 +300,8 @@ def generate(prompt,max_length=20, max_context_length=512, temperature=0.8, top_ inputs.rep_pen = rep_pen inputs.rep_pen_range = rep_pen_range inputs.stream_sse = stream_sse + inputs.grammar = grammar.encode("UTF-8") + inputs.unban_tokens_rt = not use_default_badwordsids if args.usemirostat and args.usemirostat[0]>0: inputs.mirostat = int(args.usemirostat[0]) inputs.mirostat_tau = float(args.usemirostat[1]) @@ -291,6 +340,7 @@ def utfprint(str): except UnicodeEncodeError: # Replace or omit the problematic character utf_string = str.encode('ascii', 'ignore').decode('ascii') + utf_string = utf_string.replace('\a', '') #remove bell characters print(utf_string) ################################################################# @@ -302,12 +352,14 @@ maxctx = 2048 maxhordectx = 1024 maxhordelen = 256 modelbusy = threading.Lock() +requestsinqueue = 0 defaultport = 5001 -KcppVersion = "1.40.1" +KcppVersion = "1.44.2" showdebug = True showsamplerwarning = True showmaxctxwarning = True exitcounter = 0 +args = None #global args class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): sys_version = "" @@ -348,7 +400,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): sampler_order=genparams.get('sampler_order', [6,0,1,3,4,2,5]), seed=genparams.get('sampler_seed', -1), stop_sequence=genparams.get('stop_sequence', []), - stream_sse=stream_flag) + use_default_badwordsids=genparams.get('use_default_badwordsids', True), + stream_sse=stream_flag, + grammar=genparams.get('grammar', '')) else: return generate(prompt=newprompt, @@ -368,7 +422,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): sampler_order=genparams.get('sampler_order', [6,0,1,3,4,2,5]), seed=genparams.get('sampler_seed', -1), stop_sequence=genparams.get('stop_sequence', []), - stream_sse=stream_flag) + use_default_badwordsids=genparams.get('use_default_badwordsids', True), + stream_sse=stream_flag, + grammar=genparams.get('grammar', '')) recvtxt = "" if stream_flag: @@ -404,11 +460,12 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): incomplete_token_buffer = bytearray() while not handle.has_finished(): - if current_token < handle.get_stream_count(): + streamcount = handle.get_stream_count() + while current_token < streamcount: token = handle.new_token(current_token) if token is None: # Token isnt ready yet, received nullpointer - continue + break current_token += 1 @@ -421,7 +478,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): event_str = json.dumps(event_data) await self.send_sse_event("message", event_str) - await asyncio.sleep(0) + await asyncio.sleep(0.02) #this should keep things responsive # flush buffers, sleep a bit to make sure all data sent, and then force close the connection self.wfile.flush() @@ -485,7 +542,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): response_body = (json.dumps({"values": []}).encode()) elif self.path.endswith(('/api/v1/info/version', '/api/latest/info/version')): - 
response_body = (json.dumps({"result":"1.2.2"}).encode()) + response_body = (json.dumps({"result":"1.2.4"}).encode()) + + elif self.path.endswith(('/api/extra/true_max_context_length')): #do not advertise this to horde + response_body = (json.dumps({"value": maxctx}).encode()) elif self.path.endswith(('/api/extra/version')): response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion}).encode()) @@ -495,7 +555,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): laste = handle.get_last_eval_time() lastc = handle.get_last_token_count() stopreason = handle.get_last_stop_reason() - response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "stop_reason":stopreason, "idle":(0 if modelbusy.locked() else 1)}).encode()) + response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "stop_reason":stopreason, "queue":requestsinqueue, "idle":(0 if modelbusy.locked() else 1)}).encode()) if response_body is None: self.send_response(404) @@ -510,7 +570,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): return def do_POST(self): - global modelbusy + global modelbusy, requestsinqueue content_length = int(self.headers['Content-Length']) body = self.rfile.read(content_length) basic_api_flag = False @@ -518,23 +578,48 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): kai_sse_stream_flag = False self.path = self.path.rstrip('/') + if self.path.endswith(('/api/extra/tokencount')): + try: + genparams = json.loads(body) + countprompt = genparams.get('prompt', "") + count = handle.token_count(countprompt.encode("UTF-8")) + self.send_response(200) + self.end_headers() + self.wfile.write(json.dumps({"value": count}).encode()) + + except ValueError as e: + utfprint("Count Tokens - Body Error: " + str(e)) + self.send_response(400) + self.end_headers() + self.wfile.write(json.dumps({"value": -1}).encode()) + return + if self.path.endswith('/api/extra/abort'): - ag = handle.abort_generate() - self.send_response(200) - self.end_headers() - self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode()) - print("\nGeneration Aborted") + if requestsinqueue==0: + ag = handle.abort_generate() + self.send_response(200) + self.end_headers() + self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode()) + print("\nGeneration Aborted") + else: + self.wfile.write(json.dumps({"success": "false"}).encode()) return if self.path.endswith('/api/extra/generate/check'): - pendtxt = handle.get_pending_output() - pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore") + pendtxtStr = "" + if requestsinqueue==0: + pendtxt = handle.get_pending_output() + pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore") self.send_response(200) self.end_headers() self.wfile.write(json.dumps({"results": [{"text": pendtxtStr}]}).encode()) return - if not modelbusy.acquire(blocking=False): + reqblocking = False + if args.multiuser and requestsinqueue < 4: #up to 5 concurrent requests + reqblocking = True + requestsinqueue += 1 + if not modelbusy.acquire(blocking=reqblocking): self.send_response(503) self.end_headers() self.wfile.write(json.dumps({"detail": { @@ -542,6 +627,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): "type": "service_unavailable", }}).encode()) return + if reqblocking: + requestsinqueue = (requestsinqueue - 1) if requestsinqueue>0 else 0 try: if self.path.endswith('/request'): @@ -646,7 +733,7 @@ def 
RunServerMultiThreaded(addr, port, embedded_kailite = None): exitcounter = 999 self.httpd.server_close() - numThreads = 8 + numThreads = 10 threadArr = [] for i in range(numThreads): threadArr.append(Thread(i)) @@ -669,7 +756,7 @@ def show_new_gui(): import tkinter as tk root = tk.Tk() #we dont want the useless window to be visible, but we want it in taskbar root.attributes("-alpha", 0) - args.model_param = askopenfilename(title="Select ggml model .bin files") + args.model_param = askopenfilename(title="Select ggml model .bin or .gguf files") root.destroy() if not args.model_param: print("\nNo ggml model file was selected. Exiting.") @@ -704,16 +791,17 @@ def show_new_gui(): (lib_openblas, "Use OpenBLAS"), (lib_clblast, "Use CLBlast"), (lib_cublas, "Use CuBLAS"), + (lib_hipblas, "Use hipBLAS (ROCm)"), (lib_default, "Use No BLAS"), (lib_noavx2, "NoAVX2 Mode (Old CPU)"), (lib_failsafe, "Failsafe Mode (Old CPU)")] - openblas_option, clblast_option, cublas_option, default_option, noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs) + openblas_option, clblast_option, cublas_option, hipblas_option, default_option, noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs) # slider data blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024", "2048"] blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024","2048"] contextsize_text = ["512", "1024", "2048", "3072", "4096", "6144", "8192", "12288", "16384"] - runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib) or os.name == 'nt' and file_exists(opt + ".dll")] - antirunopts = [opt.replace("Use ", "") for lib, opt in lib_option_pairs if not file_exists(lib) or os.name == 'nt' and not file_exists(opt + ".dll")] + runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib)] + antirunopts = [opt.replace("Use ", "") for lib, opt in lib_option_pairs if not (opt in runopts)] if not any(runopts): show_gui_warning("No Backend Available") def tabbuttonaction(name): @@ -821,7 +909,7 @@ def show_new_gui(): debugmode = ctk.IntVar() lowvram_var = ctk.IntVar() - mmq_var = ctk.IntVar() + mmq_var = ctk.IntVar(value=1) blas_threads_var = ctk.StringVar() blas_size_var = ctk.IntVar() @@ -847,6 +935,7 @@ def show_new_gui(): port_var = ctk.StringVar(value=defaultport) host_var = ctk.StringVar(value="") + multiuser_var = ctk.IntVar() horde_name_var = ctk.StringVar(value="koboldcpp") horde_gen_var = ctk.StringVar(value=maxhordelen) horde_context_var = ctk.StringVar(value=maxhordectx) @@ -857,18 +946,10 @@ def show_new_gui(): # Quick Launch Tab quick_tab = tabcontent["Quick Launch"] - # gpu options - quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 5, 50) - quick_gpu_selector_label = makelabel(quick_tab, "GPU ID:", 3) - quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly") - CUDA_quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3","All"], width=60, variable=gpu_choice_var, state="readonly") - quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM", lowvram_var, 4,0) - quick_mmq_box = makecheckbox(quick_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1) - def changerunmode(a,b,c): index = runopts_var.get() - if index == "Use CLBlast" or index == "Use CuBLAS": + if index == "Use CLBlast" or 
index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)": gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw") quick_gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw") if index == "Use CLBlast": @@ -876,7 +957,7 @@ def show_new_gui(): quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") if gpu_choice_var.get()=="All": gpu_choice_var.set("1") - elif index == "Use CuBLAS": + elif index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)": CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") CUDA_quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw") else: @@ -887,7 +968,7 @@ def show_new_gui(): quick_gpu_selector_box.grid_forget() CUDA_quick_gpu_selector_box.grid_forget() - if index == "Use CuBLAS": + if index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)": lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw") quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw") mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw") @@ -898,7 +979,7 @@ def show_new_gui(): mmq_box.grid_forget() quick_mmq_box.grid_forget() - if index == "Use CLBlast" or index == "Use CuBLAS": + if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)": gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw") gpu_layers_entry.grid(row=5, column=1, padx=8, pady=1, stick="nw") quick_gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw") @@ -919,6 +1000,14 @@ def show_new_gui(): # Tell user how many backends are available setup_backend_tooltip(quick_tab) + # gpu options + quick_gpu_selector_label = makelabel(quick_tab, "GPU ID:", 3) + quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3","4"], width=60, variable=gpu_choice_var, state="readonly") + CUDA_quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3","4","All"], width=60, variable=gpu_choice_var, state="readonly") + quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 5, 50) + quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM", lowvram_var, 4,0) + quick_mmq_box = makecheckbox(quick_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1) + # threads makelabelentry(quick_tab, "Threads:" , threads_var, 8, 50) @@ -938,25 +1027,23 @@ def show_new_gui(): # Hardware Tab hardware_tab = tabcontent["Hardware"] - # gpu options - gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50) - gpu_selector_label = makelabel(hardware_tab, "GPU ID:", 3) - gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly") - CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3", "All"], width=60, variable=gpu_choice_var, state="readonly") - lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0) - mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1) - # presets selector makelabel(hardware_tab, "Presets:", 1) runoptbox = ctk.CTkComboBox(hardware_tab, values=runopts, width=180,variable=runopts_var, state="readonly") runoptbox.grid(row=1, column=1,padx=8, stick="nw") runoptbox.set(runopts[0]) # Set to first available option - runopts_var.trace('w', changerunmode) - changerunmode(1,1,1) # Tell user how many backends are available setup_backend_tooltip(hardware_tab) + # gpu options + gpu_selector_label = makelabel(hardware_tab, "GPU ID:", 3) + gpu_selector_box = ctk.CTkComboBox(hardware_tab, 
values=["1","2","3","4"], width=60, variable=gpu_choice_var, state="readonly") + CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4", "All"], width=60, variable=gpu_choice_var, state="readonly") + gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50) + lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0) + mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1) + # threads makelabelentry(hardware_tab, "Threads:" , threads_var, 8, 50) @@ -973,6 +1060,9 @@ def show_new_gui(): # force version makelabelentry(hardware_tab, "Force Version:" , version_var, 100, 50) + runopts_var.trace('w', changerunmode) + changerunmode(1,1,1) + # Tokens Tab tokens_tab = tabcontent["Tokens"] # tokens checkboxes @@ -1025,14 +1115,16 @@ def show_new_gui(): makelabelentry(network_tab, "Port: ", port_var, 1, 150) makelabelentry(network_tab, "Host: ", host_var, 2, 150) + makecheckbox(network_tab, "Multiuser Mode", multiuser_var, 3) + # horde - makelabel(network_tab, "Horde:", 3).grid(pady=10) + makelabel(network_tab, "Horde:", 5).grid(pady=10) - horde_name_entry, horde_name_label = makelabelentry(network_tab, "Horde Model Name:", horde_name_var, 5, 180) - horde_gen_entry, horde_gen_label = makelabelentry(network_tab, "Gen. Length:", horde_gen_var, 6, 50) - horde_context_entry, horde_context_label = makelabelentry(network_tab, "Max Context:",horde_context_var, 7, 50) - horde_apikey_entry, horde_apikey_label = makelabelentry(network_tab, "API Key (If Embedded Worker):",horde_apikey_var, 8, 180) - horde_workername_entry, horde_workername_label = makelabelentry(network_tab, "Horde Worker Name:",horde_workername_var, 9, 180) + horde_name_entry, horde_name_label = makelabelentry(network_tab, "Horde Model Name:", horde_name_var, 7, 180) + horde_gen_entry, horde_gen_label = makelabelentry(network_tab, "Gen. 
Length:", horde_gen_var, 8, 50) + horde_context_entry, horde_context_label = makelabelentry(network_tab, "Max Context:",horde_context_var, 9, 50) + horde_apikey_entry, horde_apikey_label = makelabelentry(network_tab, "API Key (If Embedded Worker):",horde_apikey_var, 10, 180) + horde_workername_entry, horde_workername_label = makelabelentry(network_tab, "Horde Worker Name:",horde_workername_var, 11, 180) def togglehorde(a,b,c): labels = [horde_name_label, horde_gen_label, horde_context_label, horde_apikey_label, horde_workername_label] @@ -1047,13 +1139,13 @@ def show_new_gui(): basefile = os.path.basename(model_var.get()) horde_name_var.set(os.path.splitext(basefile)[0]) - makecheckbox(network_tab, "Configure for Horde", usehorde_var, 4, command=togglehorde) + makecheckbox(network_tab, "Configure for Horde", usehorde_var, 6, command=togglehorde) togglehorde(1,1,1) # launch def guilaunch(): if model_var.get() == "": - tmp = askopenfilename(title="Select ggml model .bin files") + tmp = askopenfilename(title="Select ggml model .bin or .gguf files") model_var.set(tmp) nonlocal nextstate nextstate = 1 @@ -1083,8 +1175,8 @@ def show_new_gui(): if gpu_choice_var.get()!="All": gpuchoiceidx = int(gpu_choice_var.get())-1 if runopts_var.get() == "Use CLBlast": - args.useclblast = [[0,0], [1,0], [0,1]][gpuchoiceidx] - if runopts_var.get() == "Use CuBLAS": + args.useclblast = [[0,0], [1,0], [0,1], [1,1]][gpuchoiceidx] + if runopts_var.get() == "Use CuBLAS" or runopts_var.get() == "Use hipBLAS (ROCm)": if gpu_choice_var.get()=="All": args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"] else: @@ -1118,6 +1210,7 @@ def show_new_gui(): args.port_param = defaultport if port_var.get()=="" else int(port_var.get()) args.host = host_var.get() + args.multiuser = multiuser_var.get() == 1 if horde_apikey_var.get()=="" or horde_workername_var.get()=="": args.hordeconfig = None if usehorde_var.get() == 0 else [horde_name_var.get(), horde_gen_var.get(), horde_context_var.get()] @@ -1139,14 +1232,17 @@ def show_new_gui(): if "useclblast" in dict and dict["useclblast"]: if clblast_option is not None: runopts_var.set(clblast_option) - gpu_choice_var.set(str(["0 0", "1 0", "0 1"].index(str(dict["useclblast"][0]) + " " + str(dict["useclblast"][1])) + 1)) + gpu_choice_var.set(str(["0 0", "1 0", "0 1", "1 1"].index(str(dict["useclblast"][0]) + " " + str(dict["useclblast"][1])) + 1)) elif "usecublas" in dict and dict["usecublas"]: - if cublas_option is not None: - runopts_var.set(cublas_option) + if cublas_option is not None or hipblas_option is not None: + if cublas_option: + runopts_var.set(cublas_option) + elif hipblas_option: + runopts_var.set(cublas_option) lowvram_var.set(1 if "lowvram" in dict["usecublas"] else 0) mmq_var.set(1 if "mmq" in dict["usecublas"] else 0) gpu_choice_var.set("All") - for g in range(3): + for g in range(4): if str(g) in dict["usecublas"]: gpu_choice_var.set(str(g+1)) break @@ -1204,6 +1300,8 @@ def show_new_gui(): if "host" in dict and dict["host"]: host_var.set(dict["host"]) + multiuser_var.set(1 if "multiuser" in dict and dict["multiuser"] else 0) + if "hordeconfig" in dict and dict["hordeconfig"] and len(dict["hordeconfig"]) > 1: horde_name_var.set(dict["hordeconfig"][0]) horde_gen_var.set(dict["hordeconfig"][1]) @@ -1421,7 +1519,7 @@ def show_old_gui(): root = tk.Tk() root.attributes("-alpha", 0) - args.model_param = askopenfilename(title="Select ggml model .bin files") + args.model_param = askopenfilename(title="Select ggml model .bin or .gguf files") root.destroy() if 
not args.model_param: print("\nNo ggml model file was selected. Exiting.") @@ -1431,7 +1529,7 @@ def show_old_gui(): else: root = tk.Tk() #we dont want the useless window to be visible, but we want it in taskbar root.attributes("-alpha", 0) - args.model_param = askopenfilename(title="Select ggml model .bin files") + args.model_param = askopenfilename(title="Select ggml model .bin or .gguf files") root.destroy() if not args.model_param: print("\nNo ggml model file was selected. Exiting.") @@ -1562,8 +1660,66 @@ def run_horde_worker(args, api_key, worker_name): time.sleep(2) sys.exit(2) -def main(args): +def unload_libs(): + global handle + import platform + OS = platform.system() + dll_close = None + if OS == "Windows": # pragma: Windows + from ctypes import wintypes + ctypes.windll.kernel32.FreeLibrary.argtypes = [wintypes.HMODULE] + dll_close = ctypes.windll.kernel32.FreeLibrary + elif OS == "Darwin": + try: + try: # macOS 11 (Big Sur). Possibly also later macOS 10s. + stdlib = ctypes.CDLL("libc.dylib") + except OSError: + stdlib = ctypes.CDLL("libSystem") + except OSError: + # Older macOSs. Not only is the name inconsistent but it's + # not even in PATH. + stdlib = ctypes.CDLL("/usr/lib/system/libsystem_c.dylib") + dll_close = stdlib.dlclose + elif OS == "Linux": + try: + stdlib = ctypes.CDLL("") + except OSError: + stdlib = ctypes.CDLL("libc.so") # Alpine Linux. + dll_close = stdlib.dlclose + elif sys.platform == "msys": + # msys can also use `ctypes.CDLL("kernel32.dll").FreeLibrary()`. + stdlib = ctypes.CDLL("msys-2.0.dll") + dll_close = stdlib.dlclose + elif sys.platform == "cygwin": + stdlib = ctypes.CDLL("cygwin1.dll") + dll_close = stdlib.dlclose + elif OS == "FreeBSD": + # FreeBSD uses `/usr/lib/libc.so.7` where `7` is another version number. + # It is not in PATH but using its name instead of its path is somehow the + # only way to open it. The name must include the .so.7 suffix. + stdlib = ctypes.CDLL("libc.so.7") + dll_close = stdlib.close + + if handle and dll_close: + print("Unloading Libraries...") + dll_close(handle._handle) + del handle + handle = None + +def main(launch_args,start_server=True): + global args + args = launch_args embedded_kailite = None + if args.config: + if isinstance(args.config, str) and os.path.exists(args.config): + with open(args.config, 'r') as f: + config = json.load(f) + for key, value in config.items(): + setattr(args, key, value) + else: + print("Specified kcpp config file invalid or not found.") + time.sleep(2) + sys.exit(2) if not args.model_param: args.model_param = args.model if not args.model_param: @@ -1692,8 +1848,22 @@ def main(args): horde_thread.daemon = True horde_thread.start() - print(f"Please connect to custom endpoint at {epurl}") - asyncio.run(RunServerMultiThreaded(args.host, args.port, embedded_kailite)) + #if post-ready script specified, execute it + if args.onready: + def onready_subprocess(): + import subprocess + print("Starting Post-Load subprocess...") + subprocess.Popen(args.onready[0], shell=True) + timer_thread = threading.Timer(1, onready_subprocess) #1 second delay + timer_thread.start() + + if start_server: + print(f"Please connect to custom endpoint at {epurl}") + asyncio.run(RunServerMultiThreaded(args.host, args.port, embedded_kailite)) + else: + print(f"Server was not started, main function complete. 
Idling.") + # while True: + # time.sleep(5) if __name__ == '__main__': print("***\nWelcome to KoboldCpp - Version " + KcppVersion) # just update version manually @@ -1708,6 +1878,7 @@ if __name__ == '__main__': parser.add_argument("--host", help="Host IP to listen on. If empty, all routable interfaces are accepted.", default="") parser.add_argument("--launch", help="Launches a web browser when load is completed.", action='store_true') parser.add_argument("--lora", help="LLAMA models only, applies a lora file on top of model. Experimental.", metavar=('[lora_filename]', '[lora_base]'), nargs='+') + parser.add_argument("--config", help="Load settings from a .kcpps file. Other arguments will be ignored", type=str, nargs='?') physical_core_limit = 1 if os.cpu_count()!=None and os.cpu_count()>1: physical_core_limit = int(os.cpu_count()/2) @@ -1734,10 +1905,10 @@ if __name__ == '__main__': compatgroup = parser.add_mutually_exclusive_group() compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true') compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2) - compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq]'), choices=['normal', 'lowvram', '0', '1', '2', 'mmq']) + compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq']) parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0) parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+') + parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", type=str, default="",nargs=1) + parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them. 
Polled-streaming is disabled while multiple requests are in queue.", action='store_true') - args = parser.parse_args() - - main(args) + main(parser.parse_args(),start_server=True) \ No newline at end of file diff --git a/llama.cpp b/llama.cpp index 74c1f635ecc71f8dceedb9e60969f3762d407b0c..e9eae1d04864ac6c37b4c596e7dc82edabe0fb56 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,15 +1,10 @@ -// Defines fileno on msys: -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#include -#include -#include -#endif - -#include "llama-util.h" +#define LLAMA_API_INTERNAL #include "llama.h" #include "ggml.h" + +#include "ggml-alloc.h" + #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" #endif @@ -18,76 +13,454 @@ #endif #ifdef GGML_USE_METAL -#include "ggml-metal.h" +# include "ggml-metal.h" #endif #ifdef GGML_USE_MPI -#include "ggml-mpi.h" +# include "ggml-mpi.h" #endif #ifdef GGML_USE_K_QUANTS -#ifndef QK_K -#ifdef GGML_QKK_64 -#define QK_K 64 -#else -#define QK_K 256 +# ifndef QK_K +# ifdef GGML_QKK_64 +# define QK_K 64 +# else +# define QK_K 256 +# endif +# endif #endif + +#ifdef __has_include + #if __has_include() + #include + #if defined(_POSIX_MAPPED_FILES) + #include + #endif + #if defined(_POSIX_MEMLOCK_RANGE) + #include + #endif + #endif #endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #ifndef NOMINMAX + #define NOMINMAX + #endif + #include + #include + #include // for _fseeki64 #endif +#include #include -#include +#include #include +#include +#include +#include +#include +#include +#include +#include #include -#include +#include #include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include #include -#include #include +#include +#include +#include +#include +#include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif -#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL) -#include "ggml-alloc.h" -#define LLAMA_USE_ALLOCATOR +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif #else -#define LLAMA_USE_SCRATCH -#define LLAMA_MAX_SCRATCH_BUFFERS 16 +#define LLAMA_ATTRIBUTE_FORMAT(...) #endif +// +// logging +// -// available llama models -enum e_model { - MODEL_UNKNOWN, - MODEL_3B, - MODEL_7B, - MODEL_13B, - MODEL_30B, - MODEL_65B, - MODEL_70B, +LLAMA_ATTRIBUTE_FORMAT(2, 3) +static void llama_log_internal (llama_log_level level, const char* format, ...); +static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data); + +#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__) +#define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__) +#define LLAMA_LOG_ERROR(...) 
llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__) + +// +// helpers +// + +static size_t utf8_len(char src) { + const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; +} + +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { + std::string result; + for (size_t pos = 0; ; pos += search.length()) { + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; + } + s = std::move(result); +} +#ifdef GGML_USE_CPU_HBM +#include +#endif + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + +LLAMA_ATTRIBUTE_FORMAT(1, 2) +static std::string format(const char * fmt, ...) { + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + +// +// gguf constants (sync with gguf.py) +// + +enum llm_arch { + LLM_ARCH_LLAMA, + LLM_ARCH_FALCON, + LLM_ARCH_BAICHUAN, + LLM_ARCH_GPT2, + LLM_ARCH_GPTJ, + LLM_ARCH_GPTNEOX, + LLM_ARCH_MPT, + LLM_ARCH_STARCODER, + LLM_ARCH_UNKNOWN, }; -static const size_t kB = 1024; -static const size_t MB = 1024*1024; +static std::map LLM_ARCH_NAMES = { + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_STARCODER, "starcoder" }, +}; -// computed for n_ctx == 2048 -// TODO: dynamically determine these sizes -// needs modifications in ggml +enum llm_kv { + LLM_KV_GENERAL_ARCHITECTURE, + LLM_KV_GENERAL_QUANTIZATION_VERSION, + LLM_KV_GENERAL_ALIGNMENT, + LLM_KV_GENERAL_NAME, + LLM_KV_GENERAL_AUTHOR, + LLM_KV_GENERAL_URL, + LLM_KV_GENERAL_DESCRIPTION, + LLM_KV_GENERAL_LICENSE, + LLM_KV_GENERAL_SOURCE_URL, + LLM_KV_GENERAL_SOURCE_HF_REPO, + + LLM_KV_CONTEXT_LENGTH, + LLM_KV_EMBEDDING_LENGTH, + LLM_KV_BLOCK_COUNT, + LLM_KV_FEED_FORWARD_LENGTH, + LLM_KV_USE_PARALLEL_RESIDUAL, + LLM_KV_TENSOR_DATA_LAYOUT, + + LLM_KV_ATTENTION_HEAD_COUNT, + LLM_KV_ATTENTION_HEAD_COUNT_KV, + LLM_KV_ATTENTION_MAX_ALIBI_BIAS, + LLM_KV_ATTENTION_CLAMP_KQV, + LLM_KV_ATTENTION_LAYERNORM_EPS, + LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, + + LLM_KV_ROPE_DIMENSION_COUNT, + LLM_KV_ROPE_FREQ_BASE, + LLM_KV_ROPE_SCALE_LINEAR, + + LLM_KV_TOKENIZER_MODEL, + LLM_KV_TOKENIZER_LIST, + LLM_KV_TOKENIZER_TOKEN_TYPE, + LLM_KV_TOKENIZER_SCORES, + LLM_KV_TOKENIZER_MERGES, + LLM_KV_TOKENIZER_BOS_ID, + LLM_KV_TOKENIZER_EOS_ID, + LLM_KV_TOKENIZER_UNK_ID, + LLM_KV_TOKENIZER_SEP_ID, + LLM_KV_TOKENIZER_PAD_ID, + LLM_KV_TOKENIZER_HF_JSON, + LLM_KV_TOKENIZER_RWKV, +}; -typedef void (*offload_func_t)(struct ggml_tensor * tensor); +static std::map LLM_KV_NAMES = { + { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, + { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, + { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, + { LLM_KV_GENERAL_NAME, "general.name" }, + { LLM_KV_GENERAL_AUTHOR, "general.author" }, + { LLM_KV_GENERAL_URL, "general.url" }, + { LLM_KV_GENERAL_DESCRIPTION, 
"general.description" }, + { LLM_KV_GENERAL_LICENSE, "general.license" }, + { LLM_KV_GENERAL_SOURCE_URL, "general.source_url" }, + { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source_hf_repo" }, + + { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, + { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, + { LLM_KV_BLOCK_COUNT, "%s.block_count" }, + { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, + { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" }, + { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, + + { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, + { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, + { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" }, + { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" }, + { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, + { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, + + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, + { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, + { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, + { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, + { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, + { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, + { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, + { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, + { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, + { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, + { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, +}; -void llama_nop(struct ggml_tensor * tensor) { // don't offload by default - (void) tensor; +struct LLM_KV { + LLM_KV(llm_arch arch) : arch(arch) {} + + llm_arch arch; + + std::string operator()(llm_kv kv) const { + return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str()); + } +}; + +enum llm_tensor { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_POS_EMBD, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_NORM_2, + LLM_TENSOR_ATTN_ROT_EMBD, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_NORM, +}; + +static std::map> LLM_TENSOR_NAMES = { + { + LLM_ARCH_LLAMA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_BAICHUAN, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { 
LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_FALCON, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_GPT2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + }, + }, + { + LLM_ARCH_GPTJ, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + }, + }, + { + LLM_ARCH_GPTNEOX, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_MPT, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + }, + }, + { + LLM_ARCH_STARCODER, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_POS_EMBD, "position_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, + { + LLM_ARCH_UNKNOWN, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + }, + }, +}; + +static llm_arch llm_arch_from_string(const std::string & name) { + for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT + if (kv.second == name) { + return kv.first; + } + } + + return LLM_ARCH_UNKNOWN; +} + +// helper to handle gguf constants +// usage: +// +// const auto tn = LLM_TN(LLM_ARCH_LLAMA); +// +// std::string name = tn(LLM_TENSOR_OUTPUT); -> "output" +// std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias" +// std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight" +// +struct LLM_TN { + LLM_TN(llm_arch arch) : arch(arch) {} + + llm_arch arch; + + std::string operator()(llm_tensor tensor) const { + return LLM_TENSOR_NAMES[arch].at(tensor); + } + + std::string operator()(llm_tensor tensor, const std::string & suffix) const { + return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix; + } + + std::string operator()(llm_tensor tensor, int bid) const { + return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid); + } + + std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { + return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." 
+ suffix; + } +}; + +// +// gguf helpers +// + +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ + } \ } // @@ -106,215 +479,641 @@ static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * } // -// memory sizes (calculated for n_batch == 512) +// llama helpers // -static const std::map & MEM_REQ_SCRATCH0(int n_ctx) -{ - static std::map k_sizes = { - { MODEL_3B, ((size_t) n_ctx / 16ull + 156ull) * MB }, - { MODEL_7B, ((size_t) n_ctx / 16ull + 164ull) * MB }, - { MODEL_13B, ((size_t) n_ctx / 12ull + 184ull) * MB }, - { MODEL_30B, ((size_t) n_ctx / 9ull + 224ull) * MB }, - { MODEL_65B, ((size_t) n_ctx / 6ull + 320ull) * MB }, // guess - { MODEL_70B, ((size_t) n_ctx / 6ull + 320ull) * MB }, - }; - return k_sizes; -} - -static const std::map & MEM_REQ_SCRATCH1() -{ - static std::map k_sizes = { - { MODEL_3B, 192ull * MB }, - { MODEL_7B, 224ull * MB }, - { MODEL_13B, 256ull * MB }, - { MODEL_30B, 320ull * MB }, - { MODEL_65B, 448ull * MB }, // guess - { MODEL_70B, 448ull * MB }, - }; - return k_sizes; -} - -// used to store the compute graph tensors + non-scratch data -static const std::map & MEM_REQ_EVAL() -{ - static std::map k_sizes = { - { MODEL_3B, 16ull * MB }, - { MODEL_7B, 20ull * MB }, - { MODEL_13B, 24ull * MB }, - { MODEL_30B, 32ull * MB }, - { MODEL_65B, 48ull * MB }, // guess - { MODEL_70B, 48ull * MB }, - }; - return k_sizes; -} - -// amount of VRAM needed per batch size to hold temporary results -// the values for 3b are not derived from testing but instead chosen conservatively -static const std::map & VRAM_REQ_SCRATCH_BASE() -{ - static std::map k_sizes = { - { MODEL_3B, 512ull * kB }, - { MODEL_7B, 512ull * kB }, - { MODEL_13B, 640ull * kB }, - { MODEL_30B, 768ull * kB }, - { MODEL_65B, 1360ull * kB }, - { MODEL_70B, 1360ull * kB }, - }; - return k_sizes; -} - -// amount of VRAM needed per batch size and context to hold temporary results -// the values for 3b are not derived from testing but instead chosen conservatively -static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() -{ - static std::map k_sizes = { - { MODEL_3B, 128ull }, - { MODEL_7B, 128ull }, - { MODEL_13B, 160ull }, - { MODEL_30B, 208ull }, - { MODEL_65B, 320ull }, - { MODEL_70B, 320ull }, - }; - return k_sizes; +#ifdef GGML_USE_CUBLAS +# define llama_host_malloc(n) ggml_cuda_host_malloc(n) +# define llama_host_free(data) ggml_cuda_host_free(data) +#elif GGML_USE_METAL +# define llama_host_malloc(n) ggml_metal_host_malloc(n) +# define llama_host_free(data) ggml_metal_host_free(data) +#elif GGML_USE_CPU_HBM +# define llama_host_malloc(n) hbw_malloc(n) +# define llama_host_free(data) if (data != NULL) hbw_free(data) +#else +# define llama_host_malloc(n) malloc(n) +# define llama_host_free(data) free(data) +#endif + +#if defined(_WIN32) +static std::string llama_format_win_err(DWORD err) { + LPSTR buf; + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); + if (!size) { + return "FormatMessageA failed"; + } + std::string 
ret(buf, size); + LocalFree(buf); + return ret; } +#endif -// default hparams (LLaMA 7B) -struct llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? - uint32_t n_embd = 4096; - uint32_t n_mult = 256; - uint32_t n_head = 32; - uint32_t n_head_kv = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; +struct llama_buffer { + void * data = NULL; + size_t size = 0; - // LLaMAv2 - // TODO: load from model data hparams - float f_ffn_mult = 1.0f; - float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; + // fallback to malloc / free + // useful in cases where CUDA can try to allocate PINNED memory + bool fallback = false; - float rope_freq_base = 10000.0f; - float rope_freq_scale = 1.0f; + void resize(size_t n) { + llama_host_free(data); - enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; + data = llama_host_malloc(n); + if (!data) { + fallback = true; + data = malloc(n); + } else { + fallback = false; + } - bool operator!=(const llama_hparams & other) const { - return static_cast(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT + GGML_ASSERT(data); + size = n; } - uint32_t n_gqa() const { - return n_head/n_head_kv; + ~llama_buffer() { + if (data) { + if (fallback) { // NOLINT + free(data); + } else { + llama_host_free(data); + } + } + + data = NULL; } +}; - uint32_t n_embd_head() const { - return n_embd/n_head; +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); } - uint32_t n_embd_gqa() const { - return n_embd/n_gqa(); + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; } - size_t kv_size() const { - size_t result = 2ull; - result *= (size_t) n_embd_gqa(); - result *= (size_t) n_ctx; - result *= (size_t) n_layer; - result *= sizeof(ggml_fp16_t); - return result; + void seek(size_t offset, int whence) const { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same } -}; -struct llama_layer { - // normalization - struct ggml_tensor * attention_norm; + void read_raw(void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, len, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + } - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; + uint32_t read_u32() const { + uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } - // normalization - struct ggml_tensor * ffn_norm; + void write_raw(const void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, len, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } - // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; + void write_u32(std::uint32_t val) const { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + 
std::fclose(fp); + } + } }; -struct llama_kv_cache { - struct ggml_tensor * k = NULL; - struct ggml_tensor * v = NULL; +struct llama_mmap { + void * addr; + size_t size; - struct ggml_context * ctx = NULL; + llama_mmap(const llama_mmap &) = delete; - llama_ctx_buffer buf; +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; - int n; // number of tokens currently in the cache + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error(format("mmap failed: %s", strerror(errno))); + } - ~llama_kv_cache() { - if (ctx) { - ggml_free(ctx); + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) { + fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) { + fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", + strerror(errno)); + } } + } -#ifdef GGML_USE_CUBLAS - ggml_cuda_free_data(k); - ggml_cuda_free_data(v); -#endif // GGML_USE_CUBLAS + ~llama_mmap() { + munmap(addr, size); } -}; +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; -struct llama_vocab { - using id = int32_t; - using token = std::string; + llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { + (void) numa; - struct token_score { - token tok; - float score; - }; + size = file->size; - std::unordered_map token_to_id; - std::vector id_to_token; -}; + HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); -struct llama_model { - e_model type = MODEL_UNKNOWN; + HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + DWORD error = GetLastError(); - llama_hparams hparams; + if (hMapping == NULL) { + throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); + } - struct ggml_tensor * tok_embeddings; + addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + error = GetLastError(); + CloseHandle(hMapping); - struct ggml_tensor * norm; - struct ggml_tensor * output; + if (addr == NULL) { + throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); + } - std::vector layers; - int n_gpu_layers; + #ifndef USE_FAILSAFE + if (prefetch) { + // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it + BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); + HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll"); + + // may fail on pre-Windows 8 systems + pPrefetchVirtualMemory = reinterpret_cast (GetProcAddress(hKernel32, "PrefetchVirtualMemory")); + + if (pPrefetchVirtualMemory) { + // advise the kernel to preload the mapped memory + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + 
llama_format_win_err(GetLastError()).c_str()); + } + } + } + #else + printf("\nPrefetchVirtualMemory skipped in compatibility mode.\n"); + #endif + } - // context - struct ggml_context * ctx = NULL; + ~llama_mmap() { + if (!UnmapViewOfFile(addr)) { + fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; - // the model memory buffer - llama_ctx_buffer buf; + llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { + (void) file; + (void) prefetch; + (void) numa; - // model memory mapped file - std::unique_ptr mapping; + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; - // objects representing data potentially being locked in memory - llama_mlock mlock_buf; - llama_mlock mlock_mmap; +// Represents some region of memory being locked using mlock or VirtualLock; +// will automatically unlock on destruction. +struct llama_mlock { + void * addr = NULL; + size_t size = 0; - // for quantize-stats only - std::vector> tensors_by_name; + bool failed_already = false; - int64_t t_load_us = 0; - int64_t t_start_us = 0; + llama_mlock() {} + llama_mlock(const llama_mlock &) = delete; - llama_vocab vocab; + ~llama_mlock() { + if (size) { + raw_unlock(addr, size); + } + } - ~llama_model() { + void init(void * ptr) { + GGML_ASSERT(addr == NULL && size == 0); // NOLINT + addr = ptr; + } + + void grow_to(size_t target_size) { + GGML_ASSERT(addr); + if (failed_already) { + return; + } + size_t granularity = lock_granularity(); + target_size = (target_size + granularity - 1) & ~(granularity - 1); + if (target_size > size) { + if (raw_lock((uint8_t *) addr + size, target_size - size)) { + size = target_size; + } else { + failed_already = true; + } + } + } + +#ifdef _POSIX_MEMLOCK_RANGE + static constexpr bool SUPPORTED = true; + + static size_t lock_granularity() { + return (size_t) sysconf(_SC_PAGESIZE); + } + + #ifdef __APPLE__ + #define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n" + #else + #define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" + #endif + + bool raw_lock(const void * addr, size_t size) const { + if (!mlock(addr, size)) { + return true; + } + + char* errmsg = std::strerror(errno); + bool suggest = (errno == ENOMEM); + + // Check if the resource limit is fine after all + struct rlimit lock_limit; + if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { + suggest = false; + } + if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) { + suggest = false; + } + + fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", + size, this->size, errmsg, suggest ? 
MLOCK_SUGGESTION : "");
+        return false;
+    }
+
+    #undef MLOCK_SUGGESTION
+
+    static void raw_unlock(void * addr, size_t size) {
+        if (munlock(addr, size)) {
+            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+        }
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    static size_t lock_granularity() {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        return (size_t) si.dwPageSize;
+    }
+
+    bool raw_lock(void * ptr, size_t len) const {
+        for (int tries = 1; ; tries++) {
+            if (VirtualLock(ptr, len)) {
+                return true;
+            }
+            if (tries == 2) {
+                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+                    len, size, llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+
+            // It failed but this was only the first try; increase the working
+            // set size and try again.
+            SIZE_T min_ws_size, max_ws_size;
+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+            // Per MSDN: "The maximum number of pages that a process can lock
+            // is equal to the number of pages in its minimum working set minus
+            // a small overhead."
+            // Hopefully a megabyte is enough overhead:
+            size_t increment = len + 1048576;
+            // The minimum must be <= the maximum, so we need to increase both:
+            min_ws_size += increment;
+            max_ws_size += increment;
+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+        }
+    }
+
+    static void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
+            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+                llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    static size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t len) const {
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
+    }
+
+    static void raw_unlock(const void * addr, size_t len) {}
+#endif
+};
+
+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
+    (void) tensor;
+}
+
+static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
+}
+
+//
+// globals
+//
+
+struct llama_state {
+    // We save the log callback globally
+    llama_log_callback log_callback = llama_log_callback_default;
+    void * log_callback_user_data = nullptr;
+};
+
+static llama_state g_state;
+
+// available llama models
+enum e_model {
+    MODEL_UNKNOWN,
+    MODEL_1B,
+    MODEL_3B,
+    MODEL_7B,
+    MODEL_13B,
+    MODEL_15B,
+    MODEL_30B,
+    MODEL_34B,
+    MODEL_40B,
+    MODEL_65B,
+    MODEL_70B,
+};
+
+static const size_t kB = 1024;
+static const size_t MB = kB*kB;
+static const size_t GB = kB*kB*kB;
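// ---- editor's note (illustrative sketch, not part of the patch) ------------
// llama_token_to_str() above uses a call-twice convention: a negative return
// from llama_token_to_piece() is the negated size the buffer would need, so the
// caller resizes once and retries. A minimal self-contained sketch of the same
// convention; piece_copy() is a hypothetical stand-in for the real API.
#include <cassert>
#include <cstring>
#include <string>
#include <vector>

static int piece_copy(const std::string & piece, char * buf, int len) {
    if ((int) piece.size() > len) {
        return -(int) piece.size(); // buffer too small: report needed size, negated
    }
    memcpy(buf, piece.data(), piece.size());
    return (int) piece.size();
}

static std::string token_to_str_sketch(const std::string & piece) {
    std::vector<char> result(8, 0); // optimistic small buffer
    int n = piece_copy(piece, result.data(), (int) result.size());
    if (n < 0) {
        result.resize(-n);          // grow to the reported size and retry
        const int check = piece_copy(piece, result.data(), (int) result.size());
        assert(check == -n);        // the second call must succeed exactly
        n = check;
    }
    return std::string(result.data(), n);
}
// ----------------------------------------------------------------------------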
+
+struct llama_hparams {
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_ctx;       // context size used during inference
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float rope_freq_base;
+    float rope_freq_scale;
+
+    bool operator!=(const llama_hparams & other) const {
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+    }
+
+    uint32_t n_gqa() const {
+        return n_head/n_head_kv;
+    }
+
+    uint32_t n_embd_head() const {
+        return n_embd/n_head;
+    }
+
+    uint32_t n_embd_gqa() const {
+        return n_embd/n_gqa();
+    }
+
+    size_t kv_size() const {
+        size_t result = 2ull;
+        result *= (size_t) n_embd_gqa();
+        result *= (size_t) n_ctx;
+        result *= (size_t) n_layer;
+        result *= sizeof(ggml_fp16_t);
+        return result;
+    }
+};
+
+struct llama_layer {
+    // normalization
+    struct ggml_tensor * attn_norm;
+    struct ggml_tensor * attn_norm_b;
+    struct ggml_tensor * attn_norm_2;
+    struct ggml_tensor * attn_norm_2_b;
+
+    // attention
+    struct ggml_tensor * wq;
+    struct ggml_tensor * wk;
+    struct ggml_tensor * wv;
+    struct ggml_tensor * wo;
+    struct ggml_tensor * wqkv;
+
+    // attention bias
+    struct ggml_tensor * bo;
+    struct ggml_tensor * bqkv;
+
+    // normalization
+    struct ggml_tensor * ffn_norm;
+    struct ggml_tensor * ffn_norm_b;
+
+    // ff
+    struct ggml_tensor * w1; // ffn_gate
+    struct ggml_tensor * w2; // ffn_down
+    struct ggml_tensor * w3; // ffn_up
+
+    // ff bias
+    struct ggml_tensor * b2; // ffn_down
+    struct ggml_tensor * b3; // ffn_up
+};
+
+struct llama_kv_cache {
+    struct ggml_tensor * k = NULL;
+    struct ggml_tensor * v = NULL;
+
+    struct ggml_context * ctx = NULL;
+
+    llama_buffer buf;
+
+    int n; // number of tokens currently in the cache
+
+    ~llama_kv_cache() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+
+#ifdef GGML_USE_CUBLAS
+        ggml_cuda_free_data(k);
+        ggml_cuda_free_data(v);
+#endif // GGML_USE_CUBLAS
+    }
+};
+
+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+    using ttype = llama_token_type;
+
+    struct token_data {
+        token text;
+        float score;
+        ttype type;
+    };
+
+    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_data>       id_to_token;
+
+    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+
+    // default LLaMA special tokens
+    id special_bos_id = 1;
+    id special_eos_id = 2;
+    id special_unk_id = 0;
+    id special_sep_id = -1;
+    id special_pad_id = -1;
+
+    id linefeed_id = 13;
+
+    int find_bpe_rank(std::string token_left, std::string token_right) const {
+        replace_all(token_left,  " ",  "\u0120");
+        replace_all(token_left,  "\n", "\u010A");
+        replace_all(token_right, " ",  "\u0120");
+        replace_all(token_right, "\n", "\u010A");
+
+        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
+        if (it == bpe_ranks.end()) {
+            return -1;
+        }
+
+        return it->second;
+    }
+};
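// ---- editor's note (illustrative sketch, not part of the patch) ------------
// find_bpe_rank() above remaps bytes before probing bpe_ranks: GPT-2 style merge
// tables store ' ' as "\u0120" (Ġ) and '\n' as "\u010A" (Ċ), so both halves of a
// candidate pair are rewritten before the lookup. A self-contained sketch; the
// replace_all() helper mirrors the one the patch assumes is defined earlier.
#include <map>
#include <string>
#include <utility>

static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

static int find_rank_sketch(const std::map<std::pair<std::string, std::string>, int> & ranks,
                            std::string left, std::string right) {
    replace_all(left,  " ",  "\u0120");
    replace_all(left,  "\n", "\u010A");
    replace_all(right, " ",  "\u0120");
    replace_all(right, "\n", "\u010A");

    const auto it = ranks.find(std::make_pair(left, right));
    return it == ranks.end() ? -1 : it->second; // -1 = pair is not a known merge
}
// ----------------------------------------------------------------------------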
+
+struct llama_model {
+    e_model     type  = MODEL_UNKNOWN;
+    llm_arch    arch  = LLM_ARCH_UNKNOWN;
+    llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
+
+    std::string name = "n/a";
+
+    llama_hparams hparams = {};
+    llama_vocab   vocab;
+
+    struct ggml_tensor * tok_embeddings;
+    struct ggml_tensor * pos_embeddings;
+
+    struct ggml_tensor * output_norm;
+    struct ggml_tensor * output_norm_b;
+    struct ggml_tensor * output;
+
+    std::vector<llama_layer> layers;
+
+    int n_gpu_layers;
+
+    // context
+    struct ggml_context * ctx = NULL;
+
+    // the model memory buffer
+    llama_buffer buf;
+
+    // model memory mapped file
+    std::unique_ptr<llama_mmap> mapping;
+
+    // objects representing data potentially being locked in memory
+    llama_mlock mlock_buf;
+    llama_mlock mlock_mmap;
+
+    // for quantize-stats only
+    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
+
+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+
+    ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
         }
@@ -343,11 +1142,9 @@ struct llama_context {
             ggml_metal_free(ctx_metal);
         }
 #endif
-#ifdef LLAMA_USE_ALLOCATOR
         if (alloc) {
             ggml_allocr_free(alloc);
         }
-#endif
     }
 
     std::mt19937 rng;
@@ -372,8 +1169,6 @@ struct llama_context {
     // key + value cache for the self attention
     struct llama_kv_cache kv_self;
 
-    size_t mem_per_token = 0;
-
     // decode output (2-dimensional array: [n_tokens][n_vocab])
     std::vector<float> logits;
     bool logits_all = false;
@@ -385,19 +1180,10 @@ struct llama_context {
     std::vector<uint8_t> work_buffer;
 
     // memory buffers used to evaluate the model
-    // TODO: move in llama_state
-    llama_ctx_buffer buf_compute;
+    llama_buffer buf_compute;
 
-#ifdef LLAMA_USE_ALLOCATOR
-    llama_ctx_buffer buf_alloc;
+    llama_buffer buf_alloc;
     ggml_allocr * alloc = NULL;
-#endif
-
-#ifdef LLAMA_USE_SCRATCH
-    llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
-    int    buf_last = 0;
-    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
-#endif
 
 #ifdef GGML_USE_METAL
     ggml_metal_context * ctx_metal = NULL;
@@ -406,401 +1192,396 @@ struct llama_context {
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
+};
 
-    void use_buf(struct ggml_context * ctx, int i) {
-#if defined(LLAMA_USE_SCRATCH)
-        size_t last_size = 0;
+//
+// kv cache helpers
+//
 
-        if (i == -1) {
-            last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
-        } else {
-            auto & buf = buf_scratch[i];
-            last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
-        }
+static bool llama_kv_cache_init(
+        const struct llama_hparams & hparams,
+             struct llama_kv_cache & cache,
+                         ggml_type   wtype,
+                               int   n_ctx,
+                               int   n_gpu_layers) {
+    const int n_embd  = hparams.n_embd_gqa();
+    const int n_layer = hparams.n_layer;
 
-        if (buf_last >= 0) {
-            buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
-        }
+    const int64_t n_mem      = n_layer*n_ctx;
+    const int64_t n_elements = n_embd*n_mem;
 
-        buf_last = i;
-#else
-        (void) i;
-        (void) ctx;
-#endif
-    }
+    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.n = 0;
 
-    size_t get_buf_max_mem(int i) const {
-#if defined(LLAMA_USE_SCRATCH)
-        return buf_max_size[i];
-#else
-        (void) i;
-        return 0;
-#endif
+    struct ggml_init_params params;
+    params.mem_size   = cache.buf.size;
+    params.mem_buffer = cache.buf.data;
+    params.no_alloc   = false;
+
+    cache.ctx = ggml_init(params);
+
+    if (!cache.ctx) {
+        LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
+        return false;
     }
-};
 
-template <typename T>
-static T checked_mul(T a, T b) {
-    T ret = a * b;
-    if (a != 0 && ret / a != b) {
-        throw std::runtime_error(format("overflow multiplying %llu * %llu",
-                (unsigned long long) a, (unsigned long long) b));
+    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    ggml_set_name(cache.k, "cache_k");
+    ggml_set_name(cache.v, "cache_v");
+
+    (void) n_gpu_layers;
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer + 1) {
+        ggml_cuda_assign_buffers_no_scratch(cache.v);
     }
-    return ret;
+    if (n_gpu_layers > n_layer + 2) {
+        ggml_cuda_assign_buffers_no_scratch(cache.k);
+    }
+#endif // GGML_USE_CUBLAS
+
+    return true;
 }
 
-static size_t checked_div(size_t a, size_t b) {
-    if (b == 0 || a % b != 0) {
-        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
+//
+// model loading and saving
+//
+
+enum llama_fver {
+    GGUF_FILE_VERSION_V1 = 1,
+    GGUF_FILE_VERSION_V2 = 2,
+};
+
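// ---- editor's note (worked example, not part of the patch) ------------------
// llama_kv_cache_init() above sizes its buffer as 2 * n_elements * type size
// plus 2 MB of ggml object overhead. For a LLaMA-7B-like shape with an F16
// cache (n_head == n_head_kv, so n_embd_gqa == n_embd == 4096) the arithmetic
// works out to 256 MiB:
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t n_embd_gqa = 4096; // 7B default, no grouped-query sharing
    const uint64_t n_ctx      = 512;
    const uint64_t n_layer    = 32;
    const uint64_t fp16_size  = 2;    // sizeof(ggml_fp16_t)

    const uint64_t n_elements = n_embd_gqa * n_ctx * n_layer; // per tensor (K or V)
    const uint64_t cache_size = 2 * n_elements * fp16_size;   // K + V

    // 2 * 4096 * 512 * 32 * 2 bytes = 268435456 bytes = 256 MiB
    printf("kv cache: %llu bytes (%.0f MiB)\n",
           (unsigned long long) cache_size, cache_size / (1024.0 * 1024.0));
    return 0;
}
// ----------------------------------------------------------------------------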
+static const char * llama_file_version_name(llama_fver version) {
+    switch (version) {
+        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
+        case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
+    }
-    return a / b;
+
+    return "unknown";
 }
 
-static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
+static std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
     char buf[256];
-    snprintf(buf, sizeof(buf), "%5u", ne.at(0));
+    snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
     for (size_t i = 1; i < ne.size(); i++) {
-        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
     }
     return buf;
 }
 
-static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
-    size_t size = ggml_type_size(type);
-    for (uint32_t dim : ne) {
-        size = checked_mul<size_t>(size, dim);
+static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
     }
-    return size / ggml_blck_size(type);
+    return buf;
 }
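// ---- editor's note (illustrative sketch, not part of the patch) ------------
// The llama_model_loader constructor below opens a GGUF file with
// no_alloc == true, so gguf_init_from_file() fills ctx_meta with tensor
// *descriptions* only; that is enough to count elements and bytes without
// reading any weights. A trimmed sketch of the same pattern, using only the
// API calls visible in this patch (error handling reduced to an early return):
#include <cstdio>
#include "ggml.h" // assumed to declare the gguf_* API, as in this patch

static void inspect_gguf(const char * fname) {
    struct ggml_context * ctx_meta = NULL;

    struct gguf_init_params params = {
        /*.no_alloc = */ true,      // metadata only: no tensor data is loaded
        /*.ctx      = */ &ctx_meta,
    };

    struct gguf_context * ctx_gguf = gguf_init_from_file(fname, params);
    if (!ctx_gguf) {
        return;
    }

    int64_t n_elements = 0;
    size_t  n_bytes    = 0;

    for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
        const char * name = gguf_get_tensor_name(ctx_gguf, i);
        struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
        n_elements += ggml_nelements(t);
        n_bytes    += ggml_nbytes(t);
    }

    printf("%s: %d KV pairs, %d tensors, %.2f MiB of tensor data\n",
           fname, gguf_get_n_kv(ctx_gguf), gguf_get_n_tensors(ctx_gguf),
           n_bytes / (1024.0 * 1024.0));

    gguf_free(ctx_gguf);
    ggml_free(ctx_meta);
}
// ----------------------------------------------------------------------------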
 
-struct llama_load_tensor {
-    std::string name;
-    enum ggml_type type = GGML_TYPE_F32;
-    std::vector<uint32_t> ne;
-    size_t file_off;
-    size_t size;
-    struct ggml_tensor * ggml_tensor = NULL;
-    uint8_t * data;
-};
+struct llama_model_loader {
+    int n_kv      = 0;
+    int n_tensors = 0;
+    int n_created = 0;
 
-struct llama_load_tensors_map {
-    // tensors is kept in a separate vector to preserve file order
-    std::vector<llama_load_tensor> tensors;
-    std::unordered_map<std::string, size_t> name_to_idx;
-};
+    int64_t n_elements = 0;
+    size_t  n_bytes    = 0;
 
-enum llama_file_version {
-    LLAMA_FILE_VERSION_GGML,
-    LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
-    LLAMA_FILE_VERSION_GGJT_V1, // added padding
-    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
-    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
-};
+    bool use_mmap = false;
 
-struct llama_file_loader {
-    llama_file file;
-    llama_file_version file_version;
-    llama_hparams hparams;
-    llama_vocab vocab;
-
-    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
-        : file(fname, "rb") {
-        fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
-        read_magic();
-        read_hparams();
-        read_vocab();
-        read_tensor_metadata(tensors_map);
-    }
-    void read_magic() {
-        uint32_t magic = file.read_u32();
-
-        if (magic == LLAMA_FILE_MAGIC_GGML) {
-            file_version = LLAMA_FILE_VERSION_GGML;
-            return;
+    llama_file  file;
+    llama_ftype ftype;
+    llama_fver  fver;
+
+    std::unique_ptr<llama_mmap> mapping;
+
+    struct gguf_context * ctx_gguf = NULL;
+    struct ggml_context * ctx_meta = NULL;
+
+    llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx_meta,
+        };
+
+        ctx_gguf = gguf_init_from_file(fname.c_str(), params);
+        if (!ctx_gguf) {
+            throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
        }
 
-        uint32_t version = file.read_u32();
+        n_kv      = gguf_get_n_kv(ctx_gguf);
+        n_tensors = gguf_get_n_tensors(ctx_gguf);
 
-        switch (magic) {
-            case LLAMA_FILE_MAGIC_GGMF:
-                switch (version) {
-                    case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
-                }
-                break;
-            case LLAMA_FILE_MAGIC_GGJT:
-                switch (version) {
-                    case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
-                    case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
-                    case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
-                }
+        fver = (enum llama_fver) gguf_get_version(ctx_gguf);
+
+        for (int i = 0; i < n_tensors; i++) {
+            const char * name = gguf_get_tensor_name(ctx_gguf, i);
+            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
+            n_elements += ggml_nelements(t);
+            n_bytes    += ggml_nbytes(t);
         }
 
-        throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                                        magic, version));
-    }
-    void read_hparams() {
-        hparams.n_vocab = file.read_u32();
-        hparams.n_embd  = file.read_u32();
-        hparams.n_mult  = file.read_u32();
-        hparams.n_head  = file.read_u32();
-        hparams.n_layer = file.read_u32();
-        hparams.n_rot   = file.read_u32();
-        hparams.ftype   = (enum llama_ftype) file.read_u32();
+        LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
+                __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
 
-        // LLaMAv2
-        // TODO: read from header
-        hparams.n_head_kv = hparams.n_head;
-    }
-    void read_vocab() {
-        vocab.id_to_token.resize(hparams.n_vocab);
+        // determine file type based on the number of tensors for each quantization and print meta data
+        // TODO: make optional
+        if (false) // disable this log for now
+        {
+            std::map<enum ggml_type, uint32_t> n_type;
 
-        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-            uint32_t len = file.read_u32();
-            std::string word = file.read_string(len);
+            uint32_t n_type_max = 0;
+            enum ggml_type type_max = GGML_TYPE_F32;
 
-            float score = 0.0f;
-            file.read_raw(&score, sizeof(score));
+            for (int i = 0; i < n_tensors; i++) {
+                const char * name = gguf_get_tensor_name(ctx_gguf, i);
+                struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
 
-            vocab.token_to_id[word] = i;
+                n_type[meta->type]++;
 
-            auto & tok_score = vocab.id_to_token[i];
-            tok_score.tok = std::move(word);
-            tok_score.score = score;
-        }
-    }
-    void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
-        while (file.tell() < file.size) {
-            llama_load_tensor tensor;
-            uint32_t n_dims = file.read_u32();
-            uint32_t name_len = file.read_u32();
-            tensor.type = (enum ggml_type) file.read_u32();
-            tensor.ne.resize(n_dims);
-            file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
-            std::string name = file.read_string(name_len);
-            if (n_dims < 1 || n_dims > 2) {
-                throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
-            }
-            switch (tensor.type) {
-                case GGML_TYPE_F32:
-                case GGML_TYPE_F16:
-                case GGML_TYPE_Q4_0:
-                case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q5_0:
-                case GGML_TYPE_Q5_1:
-                case GGML_TYPE_Q8_0:
-                case GGML_TYPE_Q2_K:
-                case GGML_TYPE_Q3_K:
-                case GGML_TYPE_Q4_K:
-                case GGML_TYPE_Q5_K:
-                case GGML_TYPE_Q6_K:
-                    break;
-                default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
+                if (n_type_max < n_type[meta->type]) {
+                    n_type_max = n_type[meta->type];
+                    type_max   = meta->type;
                 }
+
+                LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
             }
 
-            // skip to the next multiple of 32 bytes
-            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
-                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            switch (type_max) {
+                case GGML_TYPE_F32:  ftype = LLAMA_FTYPE_ALL_F32;     break;
+                case GGML_TYPE_F16:  ftype = LLAMA_FTYPE_MOSTLY_F16;  break;
+                case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
+                case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
+                case 
GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; + case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; + case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; + case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; + case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; + case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; + case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; + case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; + default: + { + LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); + ftype = LLAMA_FTYPE_ALL_F32; + } break; } - tensor.file_off = file.tell(); - tensor.name = name; - tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type); - file.seek(tensor.size, SEEK_CUR); + // this is a way to mark that we have "guessed" the file type + ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); - tensors_map.tensors.push_back(tensor); - tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1; - } - } -}; + { + const int kid = gguf_find_key(ctx_gguf, "general.file_type"); + if (kid >= 0) { + ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid); + } + } -struct llama_file_saver { - llama_file file; - llama_file_loader * any_file_loader; - llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype) - : file(fname, "wb"), any_file_loader(any_file_loader) { - fprintf(stderr, "llama.cpp: saving model to %s\n", fname); - write_magic(); - write_hparams(new_ftype); - write_vocab(); - } - void write_magic() { - file.write_u32(LLAMA_FILE_MAGIC); // magic - file.write_u32(LLAMA_FILE_VERSION); // version - } - void write_hparams(enum llama_ftype new_ftype) { - const llama_hparams & hparams = any_file_loader->hparams; - file.write_u32(hparams.n_vocab); - file.write_u32(hparams.n_embd); - file.write_u32(hparams.n_mult); - file.write_u32(hparams.n_head); - file.write_u32(hparams.n_layer); - file.write_u32(hparams.n_rot); - file.write_u32(new_ftype); - } - void write_vocab() { - if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) { - fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); - } - uint32_t n_vocab = any_file_loader->hparams.n_vocab; - for (uint32_t i = 0; i < n_vocab; i++) { - const auto & token_score = any_file_loader->vocab.id_to_token.at(i); - file.write_u32((uint32_t) token_score.tok.size()); - file.write_raw(token_score.tok.data(), token_score.tok.size()); - file.write_raw(&token_score.score, sizeof(token_score.score)); - } - } - void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { - switch (new_type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - break; - default: LLAMA_ASSERT(false); - } - file.write_u32((uint32_t) tensor.ne.size()); - file.write_u32((uint32_t) tensor.name.size()); - file.write_u32(new_type); - file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); - file.write_raw(tensor.name.data(), tensor.name.size()); - file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); - LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type)); - file.write_raw(new_data, new_size); - } -}; + for (int i = 0; i < n_kv; i++) { + const char * name = 
gguf_get_key(ctx_gguf, i); + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); -struct llama_model_loader { - std::unique_ptr file_loader; - llama_load_tensors_map tensors_map; - bool use_mmap; - size_t num_ggml_tensors_created = 0; - struct ggml_context * ggml_ctx = NULL; - std::unique_ptr mapping; + LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type)); + } + + // print type counts + for (auto & kv : n_type) { + if (kv.second == 0) { + continue; + } + + LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + } + } - llama_model_loader(const std::string & fname_base, bool use_mmap) { - file_loader = std::unique_ptr(new llama_file_loader(fname_base.c_str(), tensors_map)); if (!llama_mmap::SUPPORTED) { + LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); use_mmap = false; } + this->use_mmap = use_mmap; } - void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const { - *ctx_size_p = *mmapped_size_p = 0; - for (const llama_load_tensor & lt : tensors_map.tensors) { - *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; - *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16; + ~llama_model_loader() { + if (ctx_gguf) { + gguf_free(ctx_gguf); + } + if (ctx_meta) { + ggml_free(ctx_meta); } } - struct ggml_tensor * get_tensor(const std::string & name, const std::vector & ne, ggml_backend backend) { - auto it = tensors_map.name_to_idx.find(name); - if (it == tensors_map.name_to_idx.end()) { - throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str()))); - } - llama_load_tensor & lt = tensors_map.tensors.at(it->second); - if (lt.ne != ne) { - throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", - name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str())); - } + std::string get_arch_name() const { + const auto kv = LLM_KV(LLM_ARCH_UNKNOWN); + + std::string arch_name; + GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE)); - return get_tensor_for(lt, backend); + return arch_name; } - struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) { - struct ggml_tensor * tensor; - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ggml_ctx, true); + enum llm_arch get_arch() const { + const std::string arch_name = get_arch_name(); + + return llm_arch_from_string(arch_name); + } + + const char * get_tensor_name(int i) const { + return gguf_get_tensor_name(ctx_gguf, i); + } + + struct ggml_tensor * get_tensor_meta(int i) const { + return ggml_get_tensor(ctx_meta, get_tensor_name(i)); + } + + void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const { + ctx_size_p = 0; + mmapped_size_p = 0; + + for (int i = 0; i < n_tensors; i++) { + struct ggml_tensor * meta = get_tensor_meta(i); + ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + (use_mmap ? 
mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta); } - if (lt.ne.size() == 2) { - tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); - } else { - LLAMA_ASSERT(lt.ne.size() == 1); - tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0)); + } + + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) { + if (backend != GGML_BACKEND_CPU) { + ggml_set_no_alloc(ctx, true); } - ggml_set_name(tensor, lt.name.c_str()); - LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor + + struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); + tensor->backend = backend; // TODO: ggml_set_backend + ggml_set_name(tensor, ggml_get_name(meta)); if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ggml_ctx, use_mmap); + ggml_set_no_alloc(ctx, use_mmap); } - tensor->backend = backend; - lt.ggml_tensor = tensor; - num_ggml_tensors_created++; + + n_created++; + return tensor; } + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend backend) { + struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); + + if (cur == NULL) { + throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); + } + + { + bool is_ok = true; + for (size_t i = 0; i < ne.size(); ++i) { + if (ne[i] != cur->ne[i]) { + is_ok = false; + break; + } + } + if (!is_ok) { + throw std::runtime_error( + format("%s: tensor '%s' has wrong shape; expected %s, got %s", + __func__, name.c_str(), + llama_format_tensor_shape(ne).c_str(), + llama_format_tensor_shape(cur).c_str())); + } + } + + return create_tensor_for(ctx, cur, backend); + } + void done_getting_tensors() const { - if (num_ggml_tensors_created != tensors_map.tensors.size()) { - throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected")); + if (n_created != n_tensors) { + throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); + } + } + + size_t file_offset(const char * name) const { + const int idx = gguf_find_tensor(ctx_gguf, name); + + if (idx < 0) { + throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name)); + } + + return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); + } + + void load_data_for(struct ggml_tensor * cur) const { + const size_t offs = file_offset(ggml_get_name(cur)); + + if (use_mmap) { + cur->data = (uint8_t *) mapping->addr + offs; + } else { + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, ggml_nbytes(cur)); } } - void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { - size_t data_size = 0; - size_t prefetch_size = file_loader->file.size; - size_t lock_size = 0; - for (const llama_load_tensor & lt : tensors_map.tensors) { - data_size += lt.size; - if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) { - prefetch_size -= lt.size; + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + size_t size_data = 0; + size_t size_lock = 0; + size_t size_pref = 0; // prefetch + + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); + size_data += ggml_nbytes(cur); + if (cur->backend == GGML_BACKEND_CPU) { + size_pref += 
ggml_nbytes(cur); } } if (use_mmap) { - mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa())); + mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa())); if (lmlock) { lmlock->init(mapping->addr); } } size_t done_size = 0; - for (llama_load_tensor & lt : tensors_map.tensors) { + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); + GGML_ASSERT(cur); // unused tensors should have been caught by load_data already + if (progress_callback) { - progress_callback((float) done_size / data_size, progress_callback_user_data); + progress_callback((float) done_size / size_data, progress_callback_user_data); } - LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already - lt.data = (uint8_t *) lt.ggml_tensor->data; // allocate temp buffer if not using mmap - if (!use_mmap && lt.data == NULL) { - GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU); - lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor)); + if (!use_mmap && cur->data == NULL) { + GGML_ASSERT(cur->backend != GGML_BACKEND_CPU); + #ifdef GGML_USE_CPU_HBM + cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur)); + #else + cur->data = (uint8_t*)malloc(ggml_nbytes(cur)); + #endif } - load_data_for(lt); + load_data_for(cur); - switch(lt.ggml_tensor->backend) { + switch (cur->backend) { case GGML_BACKEND_CPU: - lt.ggml_tensor->data = lt.data; if (use_mmap && lmlock) { - lock_size += lt.size; - lmlock->grow_to(lock_size); + size_lock += ggml_nbytes(cur); + lmlock->grow_to(size_lock); } break; #if defined(GGML_USE_CUBLAS) case GGML_BACKEND_GPU: case GGML_BACKEND_GPU_SPLIT: - ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor); + // old code: + //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor); + + // TODO: test if this works !! 
+ ggml_cuda_transform_tensor(cur->data, cur); if (!use_mmap) { - free(lt.data); + free(cur->data); } break; #elif defined(GGML_USE_CLBLAST) case GGML_BACKEND_GPU: - ggml_cl_transform_tensor(lt.data, lt.ggml_tensor); + ggml_cl_transform_tensor(cur->data, cur); if (!use_mmap) { - free(lt.data); + free(cur->data); } break; #endif @@ -808,186 +1589,20 @@ struct llama_model_loader { continue; } - done_size += lt.size; - } - } - - void load_data_for(llama_load_tensor & lt) { - if (use_mmap) { - lt.data = (uint8_t *) mapping->addr + lt.file_off; - } else { - llama_file & file = file_loader->file; - file.seek(lt.file_off, SEEK_SET); - file.read_raw(lt.data, lt.size); - } - - if (0) { - print_checksum(lt); - } - } - - static void print_checksum(llama_load_tensor & lt) { - uint32_t sum = 0; - for (size_t i = 0; i < lt.size; i++) { - uint8_t byte = lt.data[i]; - sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash + done_size += ggml_nbytes(cur); } - fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum, - llama_format_tensor_shape(lt.ne).c_str(), lt.size); } - }; // -// kv cache -// - -static bool kv_cache_init( - const struct llama_hparams & hparams, - struct llama_kv_cache & cache, - ggml_type wtype, - int n_ctx, - int n_gpu_layers) { - const int n_embd = hparams.n_embd_gqa(); - const int n_layer = hparams.n_layer; - - const int64_t n_mem = n_layer*n_ctx; - const int64_t n_elements = n_embd*n_mem; - - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); - cache.n = 0; - - struct ggml_init_params params; - params.mem_size = cache.buf.size; - params.mem_buffer = cache.buf.addr; - params.no_alloc = false; - - cache.ctx = ggml_init(params); - - if (!cache.ctx) { - fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); - return false; - } - - cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - ggml_set_name(cache.k, "cache_k"); - ggml_set_name(cache.v, "cache_v"); - - (void) n_gpu_layers; -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer + 1) { - ggml_cuda_assign_buffers_no_scratch(cache.v); - } - if (n_gpu_layers > n_layer + 2) { - ggml_cuda_assign_buffers_no_scratch(cache.k); - } -#endif // GGML_USE_CUBLAS - - return true; -} - -struct llama_context_params llama_context_default_params() { - struct llama_context_params result = { - /*.seed =*/ LLAMA_DEFAULT_SEED, - /*.n_ctx =*/ 512, - /*.n_batch =*/ 512, - /*.n_gqa =*/ 1, - /*.rms_norm_eps =*/ LLAMA_DEFAULT_RMS_EPS, - /*.gpu_layers =*/ 0, - /*.main_gpu =*/ 0, - /*.tensor_split =*/ nullptr, - /*.rope_freq_base =*/ 10000.0f, - /*.rope_freq_scale =*/ 1.0f, - /*.progress_callback =*/ nullptr, - /*.progress_callback_user_data =*/ nullptr, - /*.low_vram =*/ false, - /*.mul_mat_q =*/ false, - /*.f16_kv =*/ true, - /*.logits_all =*/ false, - /*.vocab_only =*/ false, - /*.use_mmap =*/ true, - /*.use_mlock =*/ false, - /*.embedding =*/ false, - }; - - return result; -} - -struct llama_model_quantize_params llama_model_quantize_default_params() { - struct llama_model_quantize_params result = { - /*.nthread =*/ 0, - /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, - /*.allow_requantize =*/ false, - /*.quantize_output_tensor =*/ true, - }; - - return result; -} - -int llama_max_devices() { - return LLAMA_MAX_DEVICES; -} - -bool llama_mmap_supported() { - return llama_mmap::SUPPORTED; -} - -bool llama_mlock_supported() { - return llama_mlock::SUPPORTED; -} - -int get_blas_batch_mul(int batch) -{ - return (batch>512?(batch>1024?4:2):1); -} - -void 
llama_backend_init(bool numa) { - ggml_time_init(); - - // needed to initialize f16 tables - { - struct ggml_init_params params = { 0, NULL, false }; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); - } - - if (numa) { - ggml_numa_init(); - } - -#ifdef GGML_USE_MPI - ggml_mpi_backend_init(); -#endif -} - -void llama_backend_free() { -#ifdef GGML_USE_MPI - ggml_mpi_backend_free(); -#endif -} - -int64_t llama_time_us() { - return ggml_time_us(); -} - -// -// model loading +// load LLaMA models // -static const char *llama_file_version_name(llama_file_version version) { - switch (version) { - case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; - case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; - case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; - case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)"; - case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)"; +static std::string llama_model_ftype_name(enum llama_ftype ftype) { + if (ftype & LLAMA_FTYPE_GUESSED) { + return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; } - return "unknown"; -} - -static const char *llama_ftype_name(enum llama_ftype ftype) { switch (ftype) { case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; @@ -998,8 +1613,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; + // K-quants - case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; + case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small"; case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large"; @@ -1007,161 +1623,348 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small"; case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; - default: return "unknown, may not work"; + case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; + + default: return "unknown, may not work"; } } -static const char *llama_model_type_name(e_model type) { +static const char * llama_model_type_name(e_model type) { switch (type) { - case MODEL_3B: return "3B"; - case MODEL_7B: return "7B"; + case MODEL_1B: return "1B"; + case MODEL_3B: return "3B"; + case MODEL_7B: return "7B"; case MODEL_13B: return "13B"; + case MODEL_15B: return "15B"; case MODEL_30B: return "30B"; + case MODEL_34B: return "34B"; + case MODEL_40B: return "40B"; case MODEL_65B: return "65B"; case MODEL_70B: return "70B"; - default: LLAMA_ASSERT(false); + default: return "?B"; } } -static void llama_model_load_internal( - const std::string & fname, +static void llm_load_arch(llama_model_loader & ml, llama_model & model) { + model.arch = ml.get_arch(); + if (model.arch == LLM_ARCH_UNKNOWN) { + throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'"); + } +} + +static void llm_load_hparams( + llama_model_loader & ml, llama_model & model, - llama_vocab & vocab, int n_ctx, - int n_batch, - int n_gqa, - float rms_norm_eps, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - const bool 
mul_mat_q, float rope_freq_base, - float rope_freq_scale, - bool low_vram, - ggml_type memory_type, - bool use_mmap, - bool use_mlock, - bool vocab_only, - llama_progress_callback progress_callback, - void * progress_callback_user_data) { - - model.t_start_us = ggml_time_us(); - size_t blasbatchmul = get_blas_batch_mul(n_batch); - - std::unique_ptr ml(new llama_model_loader(fname, use_mmap)); + float rope_freq_scale) { + struct gguf_context * ctx = ml.ctx_gguf; - vocab = std::move(ml->file_loader->vocab); - model.hparams = ml->file_loader->hparams; - model.n_gpu_layers = n_gpu_layers; - llama_file_version file_version = ml->file_loader->file_version; + const auto kv = LLM_KV(model.arch); auto & hparams = model.hparams; - // TODO: read from file - hparams.f_rms_norm_eps = rms_norm_eps; - - { - switch (hparams.n_layer) { - case 26: model.type = e_model::MODEL_3B; break; - case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; - case 60: model.type = e_model::MODEL_30B; break; - case 80: model.type = e_model::MODEL_65B; break; - default: - { - if (hparams.n_layer < 32) { - model.type = e_model::MODEL_7B; - } - } break; - } + // get general kv + GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME)); - hparams.n_ctx = n_ctx; + // get hparams kv + GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST)); + GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH)); + GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); + GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); + GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); + GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); - // LLaMAv2 - // TODO: temporary until GGUF - //patch for llama2 gqa - if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) { - fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__); - n_gqa = 8; - } - LLAMA_ASSERT(hparams.n_head % n_gqa == 0); - hparams.n_head_kv = hparams.n_head / n_gqa; - if (model.type == e_model::MODEL_65B && n_gqa == 8) { - fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa); - model.type = e_model::MODEL_70B; - hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model - } + // n_head_kv is optional, default to n_head + hparams.n_head_kv = hparams.n_head; + GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); - hparams.rope_freq_base = rope_freq_base; - hparams.rope_freq_scale = rope_freq_scale; + // rope_freq_base (optional) + if (rope_freq_base == 0.0f) { + rope_freq_base = 10000.0f; + GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); } - // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199 - const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3; - const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw; - const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; - //const uint32_t n_ff = 28672; + // rope_freq_scale (inverse of the kv) is optional + if (rope_freq_scale == 0.0f) { + 
float ropescale = 1.0f; + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + rope_freq_scale = 1.0f/ropescale; + } + // sanity check for n_rot (optional) { - fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); - fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); - fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); - fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); - fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); - fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); - fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim - fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa()); - fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps); - fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); - fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); - fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); - fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); - fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); - } + hparams.n_rot = hparams.n_embd / hparams.n_head; - if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { - if (hparams.ftype != LLAMA_FTYPE_ALL_F32 && - hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 && - hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) { - printf("\nthis format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"); + GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); + + if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) { + if (hparams.n_rot != hparams.n_embd / hparams.n_head) { + throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head)); + } } + // gpt-neox n_rot = rotary_pct * (n_embd / n_head) + // gpt-j n_rot = rotary_dim } - if (file_version < LLAMA_FILE_VERSION_GGJT_V3) { - if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || - hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 || - hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) { - printf("\nthis format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"); + // arch-specific KVs + switch (model.arch) { + case LLM_ARCH_LLAMA: + { + GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + + switch (hparams.n_layer) { + case 26: model.type = e_model::MODEL_3B; break; + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 48: model.type = e_model::MODEL_34B; break; + case 60: model.type = e_model::MODEL_30B; break; + case 80: model.type = hparams.n_head == hparams.n_head_kv ? 
e_model::MODEL_65B : e_model::MODEL_70B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_FALCON: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 60: model.type = e_model::MODEL_40B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_BAICHUAN: + { + GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_STARCODER: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; + case 36: model.type = e_model::MODEL_3B; break; + case 42: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_15B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + default: (void)0; + }; + + model.ftype = ml.ftype; + + hparams.n_ctx = n_ctx; + hparams.rope_freq_base = rope_freq_base; + hparams.rope_freq_scale = rope_freq_scale; +} + +// TODO: This should probably be in llama.h +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos); +static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch); + +static void llm_load_vocab( + llama_model_loader & ml, + llama_model & model) { + auto & vocab = model.vocab; + + struct gguf_context * ctx = ml.ctx_gguf; + + const auto kv = LLM_KV(model.arch); + + const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); + if (token_idx == -1) { + throw std::runtime_error("cannot find tokenizer vocab in model file\n"); + } + + const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); + if (score_idx == -1) { + throw std::runtime_error("cannot find tokenizer scores in model file\n"); + } + + const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); + + const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); + if (toktype_idx == -1) { + throw std::runtime_error("cannot find token type list in GGUF file\n"); + } + + const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); + + // determine vocab type + { + std::string tokenizer_name; + + GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL)); + + if (tokenizer_name == "llama") { + vocab.type = LLAMA_VOCAB_TYPE_SPM; + + // default special tokens + vocab.special_bos_id = 1; + vocab.special_eos_id = 2; + vocab.special_unk_id = 0; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + } else if (tokenizer_name == "gpt2") { + vocab.type = LLAMA_VOCAB_TYPE_BPE; + + // read bpe merges and populate bpe ranks + const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); + if (merges_keyidx == -1) { + throw std::runtime_error("cannot find tokenizer merges in model file\n"); + } + + const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); + + for (int i = 0; i < n_merges; i++) { + const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); + + std::string first; + std::string second; + + const size_t pos = 
word.find(' ', 1); + + if (pos != std::string::npos) { + first = word.substr(0, pos); + second = word.substr(pos + 1); + } + + vocab.bpe_ranks.emplace(std::make_pair(first, second), i); + } + + // default special tokens + vocab.special_bos_id = 11; + vocab.special_eos_id = 11; + vocab.special_unk_id = -1; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + } else { + LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); + LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); + + vocab.type = LLAMA_VOCAB_TYPE_SPM; } } - if (vocab_only) { - return; + const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); + + vocab.id_to_token.resize(n_vocab); + + for (uint32_t i = 0; i < n_vocab; i++) { + std::string word = gguf_get_arr_str(ctx, token_idx, i); + + vocab.token_to_id[word] = i; + + auto & token_data = vocab.id_to_token[i]; + token_data.text = std::move(word); + token_data.score = scores[i]; + token_data.type = (llama_token_type) toktypes[i]; + } + + // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' + if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { + vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + } else { + vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0]; + } + + // special tokens + GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); + GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID)); + GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID)); + GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID)); + GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID)); +} + +static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + // hparams + LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); + LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); + LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix + LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); + LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size()); + LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx); + LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); + LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); + LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); + LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. 
n_embd_head, n_head_dim + LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); + LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); + LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); + LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); + LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); + LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); + LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); + if (ml.n_bytes < GB) { + LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); + } else { + LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); } - auto & ctx = model.ctx; + // general kv + LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); + + // special tokens + if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } + if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } + if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } + if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } + if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } + if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } +} + +static void llm_load_tensors( + llama_model_loader & ml, + llama_model & model, + int n_batch, + int n_gpu_layers, + int main_gpu, + const float * tensor_split, + const bool mul_mat_q, + bool low_vram, + ggml_type memory_type, + bool use_mlock, + llama_progress_callback progress_callback, + void * progress_callback_user_data) { + model.t_start_us = ggml_time_us(); + + auto & ctx = model.ctx; + auto & hparams = model.hparams; + + model.n_gpu_layers = n_gpu_layers; size_t ctx_size; size_t mmapped_size; - ml->calc_sizes(&ctx_size, &mmapped_size); - fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); + + ml.calc_sizes(ctx_size, mmapped_size); + + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); // create the ggml context { model.buf.resize(ctx_size); if (use_mlock) { - model.mlock_buf.init (model.buf.addr); + model.mlock_buf.init (model.buf.data); model.mlock_buf.grow_to(model.buf.size); } - struct ggml_init_params params = { - /*.mem_size =*/ model.buf.size, - /*.mem_buffer =*/ model.buf.addr, - /*.no_alloc =*/ ml->use_mmap, - }; + struct ggml_init_params params; + /*.mem_size =*/ params.mem_size = model.buf.size; + /*.mem_buffer =*/ params.mem_buffer = model.buf.data; + /*.no_alloc =*/ params.no_alloc = ml.use_mmap; + model.ctx = ggml_init(params); if (!model.ctx) { @@ -1172,13 
+1975,13 @@ static void llama_model_load_internal( (void) main_gpu; (void) mul_mat_q; #if defined(GGML_USE_CUBLAS) - fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); + LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); ggml_cuda_set_main_device(main_gpu); ggml_cuda_set_mul_mat_q(mul_mat_q); #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT #elif defined(GGML_USE_CLBLAST) - fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__); + LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU #else @@ -1188,80 +1991,311 @@ static void llama_model_load_internal( // prepare memory for the weights size_t vram_weights = 0; - size_t vram_scratch = 0; { - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_embd_gqa = hparams.n_embd_gqa(); - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_vocab = hparams.n_vocab; + const int64_t n_embd = hparams.n_embd; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_layer = hparams.n_layer; + const int64_t n_vocab = hparams.n_vocab; + + const auto tn = LLM_TN(model.arch); + switch (model.arch) { + case LLM_ARCH_LLAMA: + { + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + // output + { + ggml_backend backend_norm; + ggml_backend backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 - ml->ggml_ctx = ctx; + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } - model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - // "output" tensor - { - ggml_backend backend_norm; - ggml_backend backend_output; - if (n_gpu_layers > int(n_layer)) { // NOLINT - // norm is not performance relevant on its own but keeping it in VRAM reduces data copying - // on Windows however this is detrimental unless everything is on the GPU + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + + layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); + layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + + layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + } + } + } break; + case LLM_ARCH_BAICHUAN: + { + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + { + ggml_backend backend_norm; + ggml_backend backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU #ifndef _WIN32 - backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; #else - backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; #endif // _WIN32 - backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } - model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm); - model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output); - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } - const int i_gpu_start = n_layer - n_gpu_layers; + const uint32_t n_ff = hparams.n_ff; - model.layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const ggml_backend backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const int i_gpu_start = n_layer - n_gpu_layers; - auto & layer = model.layers[i]; + model.layers.resize(n_layer); - std::string layers_i = "layers." + std::to_string(i); + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT - layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend); + auto & layer = model.layers[i]; - layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split); - layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split); + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); + layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); + layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split); - layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split); - layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); - } - } + layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + } + } + } break; + case LLM_ARCH_FALCON: + { + // TODO: CPU-only for now + + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + // output + { + ggml_backend backend_norm; + ggml_backend backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = low_vram ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { + layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); + layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(layer.attn_norm_2); + vram_weights += ggml_nbytes(layer.attn_norm_2_b); + } + } + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + } + } + } break; + case LLM_ARCH_STARCODER: + { + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + + // output + { + ggml_backend backend_norm; + ggml_backend backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? 
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + + layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + + ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) + + ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3); + } + } + } break; + default: + throw std::runtime_error("unknown architecture"); + }; } - ml->done_getting_tensors(); + ml.done_getting_tensors(); // print memory requirements { @@ -1272,64 +2306,43 @@ static void llama_model_load_internal( ctx_size + mmapped_size - vram_weights; // weights in VRAM not in memory -#ifndef LLAMA_USE_ALLOCATOR - mem_required += - blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + - blasbatchmul*MEM_REQ_SCRATCH1().at(model.type) + - blasbatchmul*MEM_REQ_EVAL().at(model.type); -#endif - // this is the memory required by one llama_state - const size_t mem_required_state = - scale*hparams.kv_size(); + const size_t mem_required_state = scale*hparams.kv_size(); 
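The accounting just computed is easier to verify in isolation; the following minimal sketch mirrors ctx_size, mmapped_size, vram_weights and hparams.kv_size() from this hunk, with every byte count an illustrative assumption rather than a measured value:

#include <cstdio>
#include <cstddef>

int main() {
    const size_t MiB = 1024ull * 1024ull;

    // illustrative assumptions, not values measured from a real model
    const size_t ctx_size     =  256 * MiB; // ggml tensor-metadata context
    const size_t mmapped_size = 3800 * MiB; // size of the mmap()ed weight file
    const size_t vram_weights = 2400 * MiB; // weight bytes offloaded to VRAM
    const size_t kv_size      = 1024 * MiB; // KV cache for the configured n_ctx
    const size_t scale        = 2;          // in the patch this depends on the KV memory type

    // weights resident in VRAM never occupy host RAM, hence the subtraction
    const size_t mem_required       = ctx_size + mmapped_size - vram_weights;
    const size_t mem_required_state = scale * kv_size;

    printf("mem required = %7.2f MB (+ %7.2f MB per state)\n",
           mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
    return 0;
}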
- fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, + LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); - (void) vram_scratch; (void) n_batch; -#ifdef GGML_USE_CUBLAS - if (low_vram) { - fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); - ggml_cuda_set_scratch_size(0); // disable scratch - } else { - const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type); - const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type); - vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context); - ggml_cuda_set_scratch_size(vram_scratch); - if (n_gpu_layers > 0) { - fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n", - __func__, vram_scratch_base / kB, vram_scratch_per_context, - (vram_scratch + MB - 1) / MB); // round up - } - } -#endif // GGML_USE_CUBLAS #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); - fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); + LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); if (n_gpu_layers > (int) hparams.n_layer) { - fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__); + LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); } size_t vram_kv_cache = 0; #ifdef GGML_USE_CUBLAS const int max_backend_supported_layers = hparams.n_layer + 3; +#if defined(GGML_USE_HIPBLAS) + const int max_offloadable_layers = low_vram ? hparams.n_layer + 3 : hparams.n_layer + 3; +#else const int max_offloadable_layers = low_vram ? 
hparams.n_layer + 1 : hparams.n_layer + 3; +#endif if (n_gpu_layers > (int) hparams.n_layer + 1) { if (low_vram) { - fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); + LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); } else { - fprintf(stderr, "%s: offloading v cache to GPU\n", __func__); + LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); vram_kv_cache += hparams.kv_size() / 2; } } if (n_gpu_layers > (int) hparams.n_layer + 2) { if (low_vram) { - fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__); + LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__); } else { - fprintf(stderr, "%s: offloading k cache to GPU\n", __func__); + LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); vram_kv_cache += hparams.kv_size() / 2; } } @@ -1338,18 +2351,19 @@ static void llama_model_load_internal( const int max_offloadable_layers = hparams.n_layer + 1; #endif // GGML_USE_CUBLAS - fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n", + LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - fprintf(stderr, "%s: total VRAM used: %zu MB\n", - __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up + LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n", + __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up #else (void) n_gpu_layers; #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) } // populate `tensors_by_name` - for (llama_load_tensor & lt : ml->tensors_map.tensors) { - model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); + for (int i = 0; i < ml.n_tensors; ++i) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i)); + model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } (void) tensor_split; @@ -1359,13 +2373,13 @@ static void llama_model_load_internal( } #endif - ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); + ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? 
&model.mlock_mmap : NULL); if (progress_callback) { progress_callback(1.0f, progress_callback_user_data); } - model.mapping = std::move(ml->mapping); + model.mapping = std::move(ml.mapping); // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration @@ -1375,11 +2389,8 @@ static bool llama_model_load( const std::string & fname, llama_model & model, - llama_vocab & vocab, int n_ctx, int n_batch, - int n_gqa, - float rms_norm_eps, int n_gpu_layers, int main_gpu, const float * tensor_split, @@ -1394,24 +2405,43 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, - main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type, - use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); - return true; + std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap)); + + llm_load_arch (*ml, model); + llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale); + llm_load_vocab (*ml, model); + + llm_load_print_meta(*ml, model); + + if (model.hparams.n_vocab != model.vocab.id_to_token.size()) { + throw std::runtime_error("vocab size mismatch"); + } + + if (vocab_only) { + LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); + return true; + } + + llm_load_tensors( + *ml, model, n_batch, n_gpu_layers, + main_gpu, tensor_split, mul_mat_q, low_vram, memory_type, + use_mlock, progress_callback, progress_callback_user_data); } catch (const std::exception & err) { - fprintf(stderr, "error loading model: %s\n", err.what()); + LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); return false; } + + return true; } -static struct ggml_cgraph * llama_build_graph( +static struct ggml_cgraph * llm_build_llama( llama_context & lctx, const llama_token * tokens, const float * embd, int n_tokens, int n_past) { - LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); + GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT const int N = n_tokens; @@ -1420,7 +2450,7 @@ static struct ggml_cgraph * llama_build_graph( const auto & kv_self = lctx.kv_self; - LLAMA_ASSERT(!!kv_self.ctx); + GGML_ASSERT(!!kv_self.ctx); const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; @@ -1430,27 +2460,23 @@ static struct ggml_cgraph * llama_build_graph( const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); - LLAMA_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_rot); - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; - const float rms_norm_eps = hparams.f_rms_norm_eps; + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + const float norm_rms_eps = hparams.f_norm_rms_eps; const int n_gpu_layers = model.n_gpu_layers; - auto & mem_per_token = lctx.mem_per_token; - auto & buf_compute = lctx.buf_compute; - + auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.addr, + /*.mem_buffer =*/ buf_compute.data, /*.no_alloc =*/ false, }; -#ifdef LLAMA_USE_ALLOCATOR params.no_alloc = true; -#endif struct ggml_context * ctx0 = ggml_init(params); @@ -1462,14 +2488,10 @@ static struct ggml_cgraph * llama_build_graph( if
(tokens) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); -#ifdef LLAMA_USE_ALLOCATOR ggml_allocr_alloc(lctx.alloc, inp_tokens); if (!ggml_allocr_is_measure(lctx.alloc)) { memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); } -#else - memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); -#endif ggml_set_name(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); @@ -1480,14 +2502,10 @@ static struct ggml_cgraph * llama_build_graph( inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); -#ifdef LLAMA_USE_ALLOCATOR ggml_allocr_alloc(lctx.alloc, inpL); if (!ggml_allocr_is_measure(lctx.alloc)) { memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); } -#else - memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); -#endif } const int i_gpu_start = n_layer - n_gpu_layers; @@ -1504,25 +2522,21 @@ static struct ggml_cgraph * llama_build_graph( #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers; + offload_func_nr = ggml_cuda_assign_buffers_no_alloc; } if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers; + offload_func_v = ggml_cuda_assign_buffers_no_alloc; } if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers; + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; } #endif // GGML_USE_CUBLAS struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); -#ifdef LLAMA_USE_ALLOCATOR ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!ggml_allocr_is_measure(lctx.alloc)) { ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); } -#else - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); -#endif ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); for (int il = 0; il < n_layer; ++il) { @@ -1532,22 +2546,20 @@ static struct ggml_cgraph * llama_build_graph( #ifdef GGML_USE_CUBLAS if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers; + offload_func = ggml_cuda_assign_buffers_no_alloc; } #endif // GGML_USE_CUBLAS struct ggml_tensor * inpSA = inpL; - lctx.use_buf(ctx0, 0); - // norm { - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); offload_func(cur); ggml_set_name(cur, "rms_norm_0"); - // cur = cur*attention_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); + // cur = cur*attn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); offload_func(cur); ggml_set_name(cur, "attention_norm_0"); } @@ -1598,19 +2610,16 @@ static struct ggml_cgraph * llama_build_graph( ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); offload_func_kq(Q); ggml_set_name(Q, "Q"); struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa), - n_embd_head, n_head_kv, n_past + N), - 0, 2, 1, 3); + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_past + N, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); offload_func_kq(K); ggml_set_name(K, "K"); @@ -1639,9 +2648,9 @@ static struct ggml_cgraph * llama_build_graph( struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, n_past + N, n_embd_head, n_head_kv, - n_ctx*ggml_element_size(kv_self.v), - 
n_ctx*ggml_element_size(kv_self.v)*n_embd_head, - n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il); + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); offload_func_v(V); ggml_set_name(V, "V"); @@ -1677,8 +2686,6 @@ static struct ggml_cgraph * llama_build_graph( ggml_set_name(cur, "result_wo"); } - lctx.use_buf(ctx0, 1); - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); offload_func(inpFF); ggml_set_name(inpFF, "inpFF"); @@ -1687,7 +2694,7 @@ static struct ggml_cgraph * llama_build_graph( { // norm { - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); + cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); offload_func(cur); ggml_set_name(cur, "rms_norm_1"); @@ -1733,16 +2740,16 @@ static struct ggml_cgraph * llama_build_graph( inpL = cur; } - lctx.use_buf(ctx0, 0); + cur = inpL; // norm { - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); offload_func_nr(cur); ggml_set_name(cur, "rms_norm_2"); // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.norm); + cur = ggml_mul(ctx0, cur, model.output_norm); // offload_func_nr(cur); // TODO CPU + GPU mirrored backend ggml_set_name(cur, "result_norm"); } @@ -1751,56 +2758,22 @@ static struct ggml_cgraph * llama_build_graph( cur = ggml_mul_mat(ctx0, model.output, cur); ggml_set_name(cur, "result_output"); - lctx.use_buf(ctx0, -1); - - // logits -> probs - //cur = ggml_soft_max_inplace(ctx0, cur); - ggml_build_forward_expand(gf, cur); - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; - } - -#if 0 - printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__, - ggml_used_mem(ctx0)/1024.0/1024.0, - lctx.get_buf_max_mem(0)/1024.0/1024.0, - lctx.get_buf_max_mem(1)/1024.0/1024.0, - lctx.work_buffer.size()/1024.0/1024.0, - n_past, N); -#endif - ggml_free(ctx0); return gf; } -// evaluate the transformer -// -// - lctx: llama context -// - tokens: new batch of tokens to process -// - embd embeddings input -// - n_tokens number of tokens -// - n_past: the context size so far -// - n_threads: number of threads to use -// -static bool llama_eval_internal( + +static struct ggml_cgraph * llm_build_baichaun( llama_context & lctx, const llama_token * tokens, const float * embd, int n_tokens, - int n_past, - int n_threads, - const char * cgraph_fname) { - - LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); - - const int64_t t_start_us = ggml_time_us(); + int n_past) { -#ifdef GGML_USE_MPI - ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); -#endif + GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT const int N = n_tokens; @@ -1809,284 +2782,1507 @@ static bool llama_eval_internal( const auto & kv_self = lctx.kv_self; - LLAMA_ASSERT(!!kv_self.ctx); + GGML_ASSERT(!!kv_self.ctx); const int64_t n_embd = hparams.n_embd; - const int64_t n_vocab = hparams.n_vocab; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = hparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); -#ifdef LLAMA_USE_ALLOCATOR - ggml_allocr_reset(lctx.alloc); -#endif + GGML_ASSERT(n_embd_head == hparams.n_rot); - ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past); + const float freq_base = hparams.rope_freq_base; + const float freq_scale = 
hparams.rope_freq_scale; + const float norm_rms_eps = hparams.f_norm_rms_eps; -#ifdef LLAMA_USE_ALLOCATOR - ggml_allocr_alloc_graph(lctx.alloc, gf); -#endif - - // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + const int n_gpu_layers = model.n_gpu_layers; - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + auto & buf_compute = lctx.buf_compute; - LLAMA_ASSERT(strcmp(res->name, "result_output") == 0); - LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ false, + }; -#if GGML_USE_MPI - const int64_t n_layer = hparams.n_layer; - ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); -#endif + params.no_alloc = true; -#ifdef GGML_USE_METAL - if (lctx.ctx_metal && N == 1) { - // TODO: disabled until #2413 is resolved - //if (!ggml_metal_if_optimized(lctx.ctx_metal)) { - // ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf); - //} - ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); - ggml_metal_graph_compute(lctx.ctx_metal, gf); - ggml_metal_get_tensor (lctx.ctx_metal, res); - if (!lctx.embedding.empty()) { - ggml_metal_get_tensor(lctx.ctx_metal, embeddings); - } - } else { - // IMPORTANT: - // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla - // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API which takes advantage of the ANE or the AMX - // coprocessor. - // - // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch. - // But for now, we have focused only on Matrix x Vector Metal multiplication. 
- // - // TODO: avoid these syncs via shared memory (ref #1696) - // - if (lctx.ctx_metal) { - // We need to sync the GPU KV cache with the CPU KV cache - ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k); - ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); - } + struct ggml_context * ctx0 = ggml_init(params); - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); - } -#else - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); -#endif + ggml_cgraph * gf = ggml_new_graph(ctx0); -#if GGML_USE_MPI - ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); -#endif + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // update kv token count - lctx.kv_self.n = n_past + N; + if (tokens) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - if (cgraph_fname) { - ggml_graph_export(gf, cgraph_fname); - } + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + } + ggml_set_name(inp_tokens, "inp_tokens"); -#ifdef GGML_PERF - // print timing information per ggml operation (for debugging purposes) - // requires GGML_PERF to be defined - ggml_graph_print(gf); + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); #endif - // plot the computation graph in dot format (for debugging purposes) - //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); - //} - - // extract logits - { - auto & logits_out = lctx.logits; + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); - if (lctx.logits_all) { - logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); - } else { - // return result for just the last token - logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + ggml_allocr_alloc(lctx.alloc, inpL); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); } } - // extract embeddings - if (!lctx.embedding.empty()) { - auto & embedding_out = lctx.embedding; + const int i_gpu_start = n_layer - n_gpu_layers; + (void) i_gpu_start; - embedding_out.resize(n_embd); - memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); - } + // offload functions set the tensor output backend to GPU + // tensors are GPU-accelerated if any input or the output has been offloaded + // + // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal + // in that case ggml_cuda_assign_buffers has no effect + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; - // measure the performance only for the single-token evals - if (N == 1) { - lctx.t_eval_us += ggml_time_us() - t_start_us; - lctx.n_eval++; +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers_no_alloc; } - else if (N > 1) { - lctx.t_p_eval_us += ggml_time_us() - t_start_us; - lctx.n_p_eval += N; + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers_no_alloc; } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + } +#endif // GGML_USE_CUBLAS - return true; -} - -// -// tokenizer -// - -static size_t utf8_len(char src) { - const size_t lookup[] = { 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - uint8_t highbits = static_cast<uint8_t>(src) >> 4; - return lookup[highbits]; -} + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); -struct llama_sp_symbol { - using index = int; - index prev; - index next; - const char * text; - size_t n; -}; + for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); -static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable"); + offload_func_t offload_func = llama_nop; -struct llama_sp_bigram { - struct comparator { - bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) { - return (l.score < r.score) || (l.score == r.score && l.left > r.left); +#ifdef GGML_USE_CUBLAS + if (il >= i_gpu_start) { + offload_func = ggml_cuda_assign_buffers_no_alloc; } - }; - using queue_storage = std::vector<llama_sp_bigram>; - using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>; - llama_sp_symbol::index left; - llama_sp_symbol::index right; - float score; - size_t size; -}; +#endif // GGML_USE_CUBLAS -// original implementation: -// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 -struct llama_tokenizer { - llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {} + struct ggml_tensor * inpSA = inpL; - void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { - // split string into utf8 chars - int index = 0; - size_t offs = 0; - while (offs < text.size()) { - llama_sp_symbol sym; - size_t char_len = std::min(text.size() - offs, utf8_len(text[offs])); - sym.text = text.c_str() + offs; - sym.n = char_len; - offs += char_len; - sym.prev = index - 1; - sym.next = offs == text.size() ? -1 : index + 1; - index++; - symbols_.emplace_back(sym); - } + // norm + { + cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); + offload_func(cur); + ggml_set_name(cur, "rms_norm_0"); - // seed the work queue with all possible 2-character tokens. - for (size_t i = 1; i < symbols_.size(); ++i) { - try_add_bigram(i - 1, i); + // cur = cur*attn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); + offload_func(cur); + ggml_set_name(cur, "attention_norm_0"); } - // keep substituting the highest frequency pairs for as long as we can. - while (!work_queue_.empty()) { - auto bigram = work_queue_.top(); - work_queue_.pop(); + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + offload_func_kq(tmpk); + ggml_set_name(tmpk, "tmpk"); - auto & left_sym = symbols_[bigram.left]; - auto & right_sym = symbols_[bigram.right]; + struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + offload_func_kq(tmpq); + ggml_set_name(tmpq, "tmpq"); - // if one of the symbols already got merged, skip it.
- if (left_sym.n == 0 || right_sym.n == 0 || - left_sym.n + right_sym.n != bigram.size) { - continue; + struct ggml_tensor * Kcur; + struct ggml_tensor * Qcur; + switch (model.type) { + case MODEL_7B: + Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + break; + case MODEL_13B: + Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N); + Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N); + break; + default: + GGML_ASSERT(false); } - // merge the right sym into the left one - left_sym.n += right_sym.n; - right_sym.n = 0; - - //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size); + offload_func_kq(Kcur); + ggml_set_name(Kcur, "Kcur"); - // remove the right sym from the chain - left_sym.next = right_sym.next; - if (right_sym.next >= 0) { - symbols_[right_sym.next].prev = bigram.left; - } + offload_func_kq(Qcur); + ggml_set_name(Qcur, "Qcur"); - // find more substitutions - try_add_bigram(left_sym.prev, bigram.left); - try_add_bigram(bigram.left, left_sym.next); - } + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix - for (int i = 0; i != -1; i = symbols_[i].next) { - auto & symbol = symbols_[i]; - auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n)); + struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + offload_func_v(tmpv); + ggml_set_name(tmpv, "tmpv"); - if (token == vocab_.token_to_id.end()) { - // output any symbols that did not form tokens as bytes. - for (int j = 0; j < (int) symbol.n; ++j) { - // NOTE: old version, before #2420 - not sure what are the implications of this - //llama_vocab::id token_id = static_cast<llama_vocab::id>(symbol.text[j]) + 3; - llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j])); - output.push_back(token_id); - } - } else { - output.push_back((*token).second); - } - } - } + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); + offload_func_v(Vcur); + ggml_set_name(Vcur, "Vcur"); -private: - void try_add_bigram(int left, int right) { - if (left == -1 || right == -1) { - return; - } + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + offload_func_kq(k); + ggml_set_name(k, "k"); - const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n); - auto token = vocab_.token_to_id.find(text); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + offload_func_v(v); + ggml_set_name(v, "v"); - if (token == vocab_.token_to_id.end()) { - return; - } + // important: storing RoPE-ed version of K in the KV cache!
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } - if (static_cast<size_t>((*token).second) >= vocab_.id_to_token.size()) { - return; - } + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + offload_func_kq(Q); + ggml_set_name(Q, "Q"); - const auto &tok_score = vocab_.id_to_token[(*token).second]; + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_past + N, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + offload_func_kq(K); + ggml_set_name(K, "K"); - llama_sp_bigram bigram; - bigram.left = left; - bigram.right = right; - bigram.score = tok_score.score; - bigram.size = text.size(); - work_queue_.push(bigram); - } + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + ggml_set_name(KQ, "KQ"); - const llama_vocab & vocab_; - std::vector<llama_sp_symbol> symbols_; - llama_sp_bigram::queue work_queue_; -}; + // KQ_scaled = KQ / sqrt(n_embd_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + offload_func_kq(KQ_scaled); + ggml_set_name(KQ_scaled, "KQ_scaled"); -static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) { - llama_tokenizer tokenizer(vocab); - std::vector<llama_vocab::id> output; + struct ggml_tensor * KQ_masked; + struct ggml_tensor * KQ_scaled_alibi; - if (text.empty()) { - return output; - } + switch (model.type) { + case MODEL_7B: + KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + break; + case MODEL_13B: + KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8); + ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); + break; + default: + GGML_ASSERT(false); } + // KQ_masked = mask_past(KQ_scaled) + // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); + // offload_func_kq(KQ_masked); + // ggml_set_name(KQ_masked, "KQ_masked"); - if (bos) { - output.push_back(llama_token_bos()); - } + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + // split cached V into n_head heads + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_past + N, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + offload_func_v(V); + ggml_set_name(V, "V"); + +#if 1 + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + ggml_set_name(KQV, "KQV"); +#else + // make V contiguous in memory to speed up the matmul, however we waste time on the copy + // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation + // is there a better way?
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); +#endif + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + ggml_set_name(KQV_merged, "KQV_merged"); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + offload_func_v(cur); + ggml_set_name(cur, "KQV_merged_contiguous"); + + // projection (no bias) + cur = ggml_mul_mat(ctx0, + model.layers[il].wo, + cur); + offload_func(cur); + ggml_set_name(cur, "result_wo"); + } + + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + offload_func(inpFF); + ggml_set_name(inpFF, "inpFF"); + + // feed-forward network + { + // norm + { + cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); + offload_func(cur); + ggml_set_name(cur, "rms_norm_1"); + + // cur = cur*ffn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + offload_func(cur); + ggml_set_name(cur, "ffn_norm"); + } + + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model.layers[il].w3, + cur); + offload_func(tmp); + ggml_set_name(tmp, "result_w3"); + + cur = ggml_mul_mat(ctx0, + model.layers[il].w1, + cur); + offload_func(cur); + ggml_set_name(cur, "result_w1"); + + // SILU activation + cur = ggml_silu(ctx0, cur); + offload_func(cur); + ggml_set_name(cur, "silu"); + + cur = ggml_mul(ctx0, cur, tmp); + offload_func(cur); + ggml_set_name(cur, "silu_x_result_w3"); + + cur = ggml_mul_mat(ctx0, + model.layers[il].w2, + cur); + offload_func(cur); + ggml_set_name(cur, "result_w2"); + } + + cur = ggml_add(ctx0, cur, inpFF); + offload_func(cur); + ggml_set_name(cur, "inpFF_+_result_w2"); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + // norm + { + cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); + offload_func_nr(cur); + ggml_set_name(cur, "rms_norm_2"); + + // cur = cur*norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.output_norm); + // offload_func_nr(cur); // TODO CPU + GPU mirrored backend + ggml_set_name(cur, "result_norm"); + } + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + ggml_set_name(cur, "result_output"); + + ggml_build_forward_expand(gf, cur); + + ggml_free(ctx0); + + return gf; +} + +static struct ggml_cgraph * llm_build_falcon( + llama_context & lctx, + const llama_token * tokens, + const float * embd, + int n_tokens, + int n_past) { + + GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + + const int N = n_tokens; + + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = lctx.kv_self; + + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = hparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_rot); + + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + const float norm_eps = hparams.f_norm_eps; + + const int n_gpu_layers = model.n_gpu_layers; + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ false, 
+ }; + + params.no_alloc = true; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + if (tokens) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + } + ggml_set_name(inp_tokens, "inp_tokens"); + + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + + ggml_allocr_alloc(lctx.alloc, inpL); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + } + } + + const int i_gpu_start = n_layer - n_gpu_layers; + (void) i_gpu_start; + + // offload functions set the tensor output backend to GPU + // tensors are GPU-accelerated if any input or the output has been offloaded + // + // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal + // in that case ggml_cuda_assign_buffers has no effect + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + } +#endif // GGML_USE_CUBLAS + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + offload_func_t offload_func = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (il >= i_gpu_start) { + offload_func = ggml_cuda_assign_buffers_no_alloc; + } +#endif // GGML_USE_CUBLAS + + // self-attention + // TODO: refactor into common function (shared with LLaMA) + { + attn_norm = ggml_norm(ctx0, inpL, norm_eps); + offload_func(attn_norm); + + attn_norm = ggml_add(ctx0, + ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm), + model.layers[il].attn_norm_b); + offload_func(attn_norm->src[0]); + offload_func(attn_norm); + + if (model.layers[il].attn_norm_2) { // Falcon-40B + cur = ggml_norm(ctx0, inpL, norm_eps); + offload_func(cur); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.layers[il].attn_norm_2), + model.layers[il].attn_norm_2_b); + offload_func(cur->src[0]); + offload_func(cur); + } else { // Falcon 7B + cur = attn_norm; + } + + // compute QKV + + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + offload_func_kq(cur); + + // Note that the strides for Kcur, Vcur are set up so that the + // resulting views are misaligned with the tensor's storage + // (by applying the K/V offset we shift the tensor's original + // view to stick out behind the viewed QKV tensor's allocated + // memory, so to say). This is ok because no actual accesses + // happen to that out-of-range memory, but it can require some + // trickery when trying to accurately dump these views for + // debugging. 
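The stride arithmetic that follows is easy to misread, so a minimal sketch of the same fused-QKV slicing with plain offsets may help; the per-row layout [Q: n_head heads | K: n_head_kv heads | V: n_head_kv heads], each head n_embd_head elements wide, matches the three views below, while the Falcon-7B-like dimensions are assumptions for illustration only:

#include <cstdio>
#include <cstddef>

int main() {
    // assumed Falcon-7B-like dimensions (illustration only)
    const size_t n_embd_head = 64;
    const size_t n_head      = 71;
    const size_t n_head_kv   = 1;
    const size_t wsize       = sizeof(float); // stands in for ggml_type_size(cur->type)

    // one fused QKV row: n_head query heads, then n_head_kv key and n_head_kv value heads
    const size_t row_stride = wsize * n_embd_head * (n_head + 2*n_head_kv);
    const size_t q_offset   = 0;                                          // view taken for tmpq
    const size_t k_offset   = wsize * n_embd_head *  n_head;              // view taken for tmpk
    const size_t v_offset   = wsize * n_embd_head * (n_head + n_head_kv); // view taken for tmpv

    printf("row stride = %zu B | Q @ %zu | K @ %zu | V @ %zu\n",
           row_stride, q_offset, k_offset, v_offset);
    return 0;
}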
+ + const size_t wsize = ggml_type_size(cur->type); + + // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for + // non-contiguous views is added for the rope operator + struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( + ctx0, cur, n_embd_head, n_head, N, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + 0)); + offload_func_kq(tmpq); + + struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( + ctx0, cur, n_embd_head, n_head_kv, N, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + wsize * n_embd_head * n_head)); + offload_func_kq(tmpk); + + struct ggml_tensor * tmpv = ggml_view_3d( + ctx0, cur, n_embd_head, n_head_kv, N, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + wsize * n_embd_head * (n_head + n_head_kv)); + offload_func_v(tmpv); + + // using mode = 2 for neox mode + struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale); + offload_func_kq(Qcur); + struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale); + offload_func_kq(Kcur); + + { + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N)); + offload_func_v(Vcur); + offload_func_v(Vcur->src[0]->src[0]); + ggml_set_name(Vcur, "Vcur"); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + offload_func_kq(k); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + offload_func_v(v); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + offload_func_kq(Q); + ggml_set_name(Q, "Q"); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_past + N, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + offload_func_kq(K); + ggml_set_name(K, "K"); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + ggml_set_name(KQ, "KQ"); + + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + offload_func_kq(KQ_scaled); + ggml_set_name(KQ_scaled, "KQ_scaled"); + + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + ggml_set_name(KQ_masked, "KQ_masked"); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_past + N, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + offload_func_v(V); + ggml_set_name(V, "V"); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + ggml_set_name(KQV, "KQV"); + + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + ggml_set_name(KQV_merged, "KQV_merged"); + + cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, 
N)); + offload_func_v(cur); + ggml_set_name(cur, "KQV_merged_contiguous"); + + cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + offload_func(cur); + ggml_set_name(cur, "result_wo"); + } + + struct ggml_tensor * attn_out = cur; + + // feed forward + { + struct ggml_tensor * inpFF = attn_norm; + + cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); + offload_func(cur); + + cur = ggml_gelu(ctx0, cur); + offload_func(cur); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); + offload_func(cur); + } + + cur = ggml_add(ctx0, cur, attn_out); + offload_func(cur); + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + // norm + { + cur = ggml_norm(ctx0, cur, norm_eps); + offload_func_nr(cur); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.output_norm), + model.output_norm_b); + ggml_set_name(cur, "result_norm"); + } + + cur = ggml_mul_mat(ctx0, model.output, cur); + ggml_set_name(cur, "result_output"); + + ggml_build_forward_expand(gf, cur); + + ggml_free(ctx0); + + return gf; +} + +static struct ggml_cgraph * llm_build_starcoder( + llama_context & lctx, + const llama_token * tokens, + const float * embd, + int n_tokens, + int n_past) { + + GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + + const int N = n_tokens; + + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = lctx.kv_self; + + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = hparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_rot); + + const float norm_eps = hparams.f_norm_eps; + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ false, + }; + + params.no_alloc = true; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * token; + struct ggml_tensor * position; + struct ggml_tensor * inpL; + + if (tokens) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + } + ggml_set_name(inp_tokens, "inp_tokens"); + + token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + + ggml_allocr_alloc(lctx.alloc, token); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(token->data, embd, N * n_embd * ggml_element_size(token)); + } + } + + { + // Compute position embeddings. 
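+        // For this GPT-2-style architecture the positions are just the absolute
+        // indices of the new tokens: n_past, n_past + 1, ..., n_past + N - 1. They
+        // select rows of the learned position embedding matrix, e.g. with n_past = 10
+        // and N = 3 the ggml_get_rows() below gathers rows 10, 11 and 12.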
+ struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(lctx.alloc, inp_positions); + if (!ggml_allocr_is_measure(lctx.alloc)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) inp_positions->data)[i] = n_past + i; + } + } + ggml_set_name(inp_positions, "inp_positions"); + + position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions); + } + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + + inpL = ggml_add(ctx0, token, position); + ggml_set_name(inpL, "inpL"); + + for (int il = 0; il < n_layer; ++il) { + { + // Norm + cur = ggml_norm(ctx0, inpL, norm_eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); + } + + { + // Self Attention + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); + + struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd); + struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); + + struct ggml_tensor * Qcur = tmpq; + struct ggml_tensor * Kcur = tmpk; + + { + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N)); + ggml_set_name(Vcur, "Vcur"); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)), + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_past + N, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + ggml_set_name(K, "K"); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + ggml_set_name(KQ, "KQ"); + + // KQ_scaled = KQ / sqrt(n_embd_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + ggml_set_name(KQ_scaled, "KQ_scaled"); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + ggml_set_name(KQ_masked, "KQ_masked"); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + // split cached V into n_head heads + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_past + N, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + ggml_set_name(V, "V"); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, 
KQ_soft_max); + ggml_set_name(KQV, "KQV"); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + ggml_set_name(KQV_merged, "KQV_merged"); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_set_name(cur, "KQV_merged_contiguous"); + } + + // Projection + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); + + // Add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // FF + { + // Norm + { + cur = ggml_norm(ctx0, inpFF, norm_eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); + } + + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // Projection + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); + } + + inpL = ggml_add(ctx0, cur, inpFF); + } + + // Output Norm + { + cur = ggml_norm(ctx0, inpL, norm_eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); + } + ggml_set_name(cur, "result_norm"); + + cur = ggml_mul_mat(ctx0, model.output, cur); + ggml_set_name(cur, "result_output"); + + ggml_build_forward_expand(gf, cur); + ggml_free(ctx0); + + return gf; +} + +static struct ggml_cgraph * llama_build_graph( + llama_context & lctx, + const llama_token * tokens, + const float * embd, + int n_tokens, + int n_past) { + const auto & model = lctx.model; + + struct ggml_cgraph * result = NULL; + + switch (model.arch) { + case LLM_ARCH_LLAMA: + { + result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past); + } break; + case LLM_ARCH_BAICHUAN: + { + result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past); + } break; + case LLM_ARCH_FALCON: + { + result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past); + } break; + case LLM_ARCH_STARCODER: + { + result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past); + } break; + default: + GGML_ASSERT(false); + }; + + return result; +} + +// evaluate the transformer +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - embd embeddings input +// - n_tokens number of tokens +// - n_past: the context size so far +// - n_threads: number of threads to use +// +static bool llama_eval_internal( + llama_context & lctx, + const llama_token * tokens, + const float * embd, + int n_tokens, + int n_past, + int n_threads, + const char * cgraph_fname) { + + GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + + GGML_ASSERT(n_tokens > 0); + GGML_ASSERT(n_past >= 0); + // TODO: keep the values of n_batch and n_ctx + // GGML_ASSERT(n_tokens <= n_batch); + // GGML_ASSERT(n_past + n_tokens <= n_ctx); + + const int64_t t_start_us = ggml_time_us(); + +#ifdef GGML_USE_MPI + ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); +#endif + + GGML_ASSERT(n_threads > 0); + + const int N = n_tokens; + + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = lctx.kv_self; + + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_vocab = hparams.n_vocab; + + ggml_allocr_reset(lctx.alloc); + + ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past); + + ggml_allocr_alloc_graph(lctx.alloc, gf); + +#ifdef GGML_USE_CUBLAS + for (int i = 0; i < gf->n_leafs; i++) { + 
ggml_tensor * node = gf->leafs[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); + } + } + + for (int i = 0; i < gf->n_nodes; i++) { + ggml_tensor * node = gf->nodes[i]; + if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { + ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); + } + } +#endif + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + // for big prompts, if BLAS is enabled, it is better to use only one thread + // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance + // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well + // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering + // with the BLAS calls. need a better solution + if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { + n_threads = std::min(4, n_threads); + } + + // If all tensors can be run on the GPU then using more than 1 thread is detrimental. + const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA || + model.arch == LLM_ARCH_BAICHUAN || + model.arch == LLM_ARCH_FALCON; + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; + if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { + n_threads = 1; + } + + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + + GGML_ASSERT(strcmp(res->name, "result_output") == 0); + GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + +#if GGML_USE_MPI + const int64_t n_layer = hparams.n_layer; + ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); +#endif + +#ifdef GGML_USE_METAL + if (lctx.ctx_metal) { + ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); + ggml_metal_graph_compute(lctx.ctx_metal, gf); + } else { + ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); + } +#else + ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); +#endif + +#if GGML_USE_MPI + ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); +#endif + + // update kv token count + lctx.kv_self.n = n_past + N; + + if (cgraph_fname) { + ggml_graph_export(gf, cgraph_fname); + } + +#ifdef GGML_PERF + // print timing information per ggml operation (for debugging purposes) + // requires GGML_PERF to be defined + ggml_graph_print(gf); +#endif + + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} + + // extract logits + { + auto & logits_out = lctx.logits; + + if (lctx.logits_all) { + logits_out.resize(n_vocab * N); + memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); + } else { + // return result for just the last token + logits_out.resize(n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + } + } + + // extract embeddings + if (!lctx.embedding.empty()) { + auto & embedding_out = lctx.embedding; + + embedding_out.resize(n_embd); + memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); + } + + // measure the performance only for the single-token evals + if (N == 1) { + lctx.t_eval_us += ggml_time_us() - t_start_us; + lctx.n_eval++; + } + else if (N > 
1) {
+        lctx.t_p_eval_us += ggml_time_us() - t_start_us;
+        lctx.n_p_eval += N;
+    }
+
+    return true;
+}
+
+//
+// tokenizer
+//
+
+static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
+    return vocab.type;
+}
+
+static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
+}
+
+static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
+}
+
+static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
+}
+
+static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
+}
+
+static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(llama_is_byte_token(vocab, id));
+    const auto& token_data = vocab.id_to_token.at(id);
+    auto buf = token_data.text.substr(3, 2);
+    return strtol(buf.c_str(), NULL, 16);
+}
+
+static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    char buf[7];
+    int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
+    GGML_ASSERT(0 <= result && result < 7);
+    return vocab.token_to_id.at(buf);
+}
+
+static void llama_escape_whitespace(std::string & text) {
+    replace_all(text, " ", "\xe2\x96\x81");
+}
+
+static void llama_unescape_whitespace(std::string & word) {
+    replace_all(word, "\xe2\x96\x81", " ");
+}
+
+struct llm_symbol {
+    using index = int;
+    index prev;
+    index next;
+    const char * text;
+    size_t n;
+};
+
+static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
+
+// SPM tokenizer
+// original implementation:
+// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
+
+struct llm_bigram_spm {
+    struct comparator {
+        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
+            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
+        }
+    };
+    using queue_storage = std::vector<llm_bigram_spm>;
+    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
+    llm_symbol::index left;
+    llm_symbol::index right;
+    float score;
+    size_t size;
+};
+
+struct llm_tokenizer_spm {
+    llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // split string into utf8 chars
+        int index = 0;
+        size_t offs = 0;
+        while (offs < text.size()) {
+            llm_symbol sym;
+            size_t len = utf8_len(text[offs]);
+            sym.text = text.c_str() + offs;
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
+            sym.prev = index - 1;
+            sym.next = offs == text.size() ? -1 : index + 1;
+            index++;
+            symbols.emplace_back(sym);
+        }
+
+        // seed the work queue with all possible 2-character tokens.
+        for (size_t i = 1; i < symbols.size(); ++i) {
+            try_add_bigram(i - 1, i);
+        }
+
+        // keep substituting the highest frequency pairs for as long as we can.
+        while (!work_queue.empty()) {
+            auto bigram = work_queue.top();
+            work_queue.pop();
+
+            auto & left_sym = symbols[bigram.left];
+            auto & right_sym = symbols[bigram.right];
+
+            // if one of the symbols already got merged, skip it.
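+            // (Entries in the queue are not updated when symbols merge, so a popped
+            // bigram can be stale: e.g. after "l"+"l" -> "ll", a queued "e"+"l" no
+            // longer matches its recorded size and is skipped by the check below.)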
+            if (left_sym.n == 0 || right_sym.n == 0 ||
+                left_sym.n + right_sym.n != bigram.size) {
+                continue;
+            }
+
+            // merge the right sym into the left one
+            left_sym.n += right_sym.n;
+            right_sym.n = 0;
+
+            //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
+
+            // remove the right sym from the chain
+            left_sym.next = right_sym.next;
+            if (right_sym.next >= 0) {
+                symbols[right_sym.next].prev = bigram.left;
+            }
+
+            // find more substitutions
+            try_add_bigram(left_sym.prev, bigram.left);
+            try_add_bigram(bigram.left, left_sym.next);
+        }
+
+        for (int i = 0; i != -1; i = symbols[i].next) {
+            auto & symbol = symbols[i];
+            resegment(symbol, output);
+        }
+    }
+
+private:
+    void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) {
+        auto text = std::string(symbol.text, symbol.n);
+        auto token = vocab.token_to_id.find(text);
+
+        // Do we need to support is_unused?
+        if (token != vocab.token_to_id.end()) {
+            output.push_back((*token).second);
+            return;
+        }
+
+        const auto p = rev_merge.find(text);
+
+        if (p == rev_merge.end()) {
+            // output any symbols that did not form tokens as bytes.
+            for (int j = 0; j < (int)symbol.n; ++j) {
+                llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
+                output.push_back(token_id);
+            }
+            return;
+        }
+
+        resegment(symbols[p->second.first],  output);
+        resegment(symbols[p->second.second], output);
+    }
+
+    void try_add_bigram(int left, int right) {
+        if (left == -1 || right == -1) {
+            return;
+        }
+
+        const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
+        auto token = vocab.token_to_id.find(text);
+
+        if (token == vocab.token_to_id.end()) {
+            return;
+        }
+
+        if (static_cast<size_t>((*token).second) >= vocab.id_to_token.size()) {
+            return;
+        }
+
+        const auto & tok_data = vocab.id_to_token[(*token).second];
+
+        llm_bigram_spm bigram;
+        bigram.left  = left;
+        bigram.right = right;
+        bigram.score = tok_data.score;
+        bigram.size  = text.size();
+
+        work_queue.push(bigram);
+
+        // Do we need to support is_unused?
+        rev_merge[text] = std::make_pair(left, right);
+    }
+
+    const llama_vocab & vocab;
+
+    std::vector<llm_symbol> symbols;
+    llm_bigram_spm::queue work_queue;
+
+    std::map<std::string, std::pair<int, int>> rev_merge;
+};
+
+// BPE tokenizer
+// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
+// tried to simplify unicode stuff, so most likely does not work 100% correctly!
+
+// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
+
+struct llm_bigram_bpe {
+    struct comparator {
+        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
+            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
+        }
+    };
+
+    using queue_storage = std::vector<llm_bigram_bpe>;
+    using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
+    llm_symbol::index left;
+    llm_symbol::index right;
+    std::string text;
+    int rank;
+    size_t size;
+};
+
+struct llm_tokenizer_bpe {
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        int final_prev_index = -1;
+        auto word_collection = bpe_gpt2_preprocess(text);
+
+        symbols_final.clear();
+
+        for (auto & word : word_collection) {
+            work_queue = llm_bigram_bpe::queue();
+            symbols.clear();
+
+            int index = 0;
+            size_t offset = 0;
+
+            while (offset < word.size()) {
+                llm_symbol sym;
+                size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
+                sym.text = word.c_str() + offset;
+                sym.n = char_len;
+                offset += sym.n;
+                sym.prev = index - 1;
+                sym.next = offset == word.size() ? -1 : index + 1;
+                index++;
+                symbols.emplace_back(sym);
+            }
+            for (size_t i = 1; i < symbols.size(); ++i) {
+                add_new_bigram(i - 1, i);
+            }
+
+            // build token(s)
+            while (!work_queue.empty()) {
+                auto bigram = work_queue.top();
+                work_queue.pop();
+
+                auto & left_symbol = symbols[bigram.left];
+                auto & right_symbol = symbols[bigram.right];
+
+                if (left_symbol.n == 0 || right_symbol.n == 0) {
+                    continue;
+                }
+                std::string left_token = std::string(left_symbol.text, left_symbol.n);
+                std::string right_token = std::string(right_symbol.text, right_symbol.n);
+                if (left_token + right_token != bigram.text) {
+                    continue;  // Skip this bigram if it's outdated
+                }
+
+                // merge the right sym into the left one
+                left_symbol.n += right_symbol.n;
+                right_symbol.n = 0;
+
+                // remove the right sym from the chain
+                left_symbol.next = right_symbol.next;
+                if (right_symbol.next >= 0) {
+                    symbols[right_symbol.next].prev = bigram.left;
+                }
+
+                add_new_bigram(left_symbol.prev, bigram.left);  // left side of current symbol
+                add_new_bigram(bigram.left, left_symbol.next);  // right side of current symbol
+            }
+
+            // add the finished tokens to the final list keeping correct order for next and prev
+            for (auto & sym : symbols) {
+                if (sym.n > 0) {
+                    sym.prev = final_prev_index;
+                    sym.next = -1;
+                    if (final_prev_index != -1) {
+                        symbols_final[final_prev_index].next = symbols_final.size();
+                    }
+                    symbols_final.emplace_back(sym);
+                    final_prev_index = symbols_final.size() - 1;
+                }
+            }
+        }
+
+        symbols = symbols_final;
+
+        if (!symbols.empty()) {
+            for (int i = 0; i != -1; i = symbols[i].next) {
+                auto & symbol = symbols[i];
+                if (symbol.n == 0) {
+                    continue;
+                }
+
+                const std::string str = std::string(symbol.text, symbol.n);
+                const auto token = vocab.token_to_id.find(str);
+
+                if (token == vocab.token_to_id.end()) {
+                    for (auto j = str.begin(); j != str.end(); ++j) {
+                        std::string byte_str(1, *j);
+                        auto token_multibyte = vocab.token_to_id.find(byte_str);
+                        if (token_multibyte == vocab.token_to_id.end()) {
+                            try {
+                                llama_token token_byte = llama_byte_to_token(vocab, *j);
+                                output.push_back(token_byte);
+                            } catch (const std::out_of_range & err) {
+                                fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                            }
+                        } else {
+                            output.push_back((*token_multibyte).second);
+                        }
+                    }
+                } else {
+                    output.push_back((*token).second);
+                }
+            }
+        }
+    }
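+
+    // Usage sketch (hypothetical, assuming a gpt2-style BPE vocab has been loaded):
+    //
+    //   std::vector<llama_vocab::id> ids;
+    //   llm_tokenizer_bpe tokenizer(vocab);
+    //   tokenizer.tokenize("Hello world", ids);
+    //
+    // Pieces absent from the vocab degrade to per-byte tokens via llama_byte_to_token().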
+
+private:
+    void add_new_bigram(int left, int right) {
+        if (left == -1 || right == -1) {
+            return;
+        }
+
+        std::string left_token = std::string(symbols[left].text, symbols[left].n);
+        std::string right_token = std::string(symbols[right].text, symbols[right].n);
+
+        int rank_found = -1;
+
+        rank_found = vocab.find_bpe_rank(left_token, right_token);
+
+        if (rank_found < 0) {
+            return;
+        }
+
+        llm_bigram_bpe bigram;
+
+        bigram.left = left;
+        bigram.right = right;
+        bigram.text = left_token + right_token;
+        bigram.size = left_token.size() + right_token.size();
+        bigram.rank = rank_found;
+
+        work_queue.push(bigram);
+    }
+
+    // probably not 100% correct
+    static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
+        std::vector<std::string> words;
+
+        // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
+        const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
+        const std::regex re(pattern);
+
+        auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
+        auto words_end = std::sregex_iterator();
+        auto n_words = std::distance(words_begin, words_end);
+        words.reserve(n_words);
+        for (auto it = words_begin; it != words_end; ++it) {
+            words.push_back(it->str());
+        }
+        return words;
+
+    }
+
+    const llama_vocab & vocab;
+
+    std::vector<llm_symbol> symbols;
+    std::vector<llm_symbol> symbols_final;
+
+    llm_bigram_bpe::queue work_queue;
+};
+
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
+    std::vector<llama_vocab::id> output;
+
+    // OG tokenizer behavior:
+    //
+    // tokenizer.encode('', add_bos=True)  returns [1]
+    // tokenizer.encode('', add_bos=False) returns []
+
+    if (bos && vocab.special_bos_id != -1) {
+        output.push_back(vocab.special_bos_id);
+    }
+
+    if (raw_text.empty()) {
+        return output;
+    }
+
+    switch (vocab.type) {
+        case LLAMA_VOCAB_TYPE_SPM:
+            {
+                // without adding this leading whitespace, we do not get the same results as the original tokenizer
+                raw_text = " " + raw_text;
+
+                llm_tokenizer_spm tokenizer(vocab);
+                llama_escape_whitespace(raw_text);
+                tokenizer.tokenize(raw_text, output);
+            } break;
+        case LLAMA_VOCAB_TYPE_BPE:
+            {
+                llm_tokenizer_bpe tokenizer(vocab);
+                tokenizer.tokenize(raw_text, output);
+            } break;
+    };
 
-    tokenizer.tokenize(text, output);
     return output;
 }
 
@@ -2094,44 +4290,88 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 
 // grammar - internal
 //
 
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
 struct llama_grammar {
     const std::vector<std::vector<llama_grammar_element>> rules;
     std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
 };
 
 struct llama_grammar_candidate {
-    size_t           index;
-    const uint32_t * code_points;
+    size_t               index;
+    const uint32_t     * code_points;
+    llama_partial_utf8   partial_utf8;
 };
 
-// NOTE: assumes valid utf8 (but checks for overrun)
-// adds a terminating 0 for use as pointer
-std::vector<uint32_t> decode_utf8(const char * src) {
-    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
+// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
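+// Example: the Euro sign U+20AC is the byte sequence E2 82 AC. If a token ends after
+// the first byte, decode_utf8 returns partial_utf8 = { value = 0x02, n_remain = 2 }.
+// Passing that state in with the next token's bytes "\x82\xAC" resumes the decode:
+// 0x02 -> (0x02<<6)|0x02 = 0x82 -> (0x82<<6)|0x2C = 0x20AC, emitted as one code point.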
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const char * src,
+        llama_partial_utf8 partial_start) {
+    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char * pos = src;
     std::vector<uint32_t> code_points;
+    uint32_t value = partial_start.value;
+    int n_remain = partial_start.n_remain;
+
+    // continue previous decode, if applicable
+    while (*pos != 0 && n_remain > 0) {
+        uint8_t next_byte = static_cast<uint8_t>(*pos);
+        if ((next_byte >> 6) != 2) {
+            // invalid sequence, abort
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
+        }
+        value = (value << 6) + (next_byte & 0x3F);
+        ++pos;
+        --n_remain;
+    }
+
+    if (partial_start.n_remain > 0 && n_remain == 0) {
+        code_points.push_back(value);
+    }
+
+    // decode any subsequent utf-8 sequences, which may end in an incomplete one
    while (*pos != 0) {
        uint8_t first_byte = static_cast<uint8_t>(*pos);
        uint8_t highbits = first_byte >> 4;
-        int len = lookup[highbits];
-        uint8_t mask = (1 << (8 - len)) - 1;
-        uint32_t value = first_byte & mask;
-        const char * end = pos + len; // may overrun!
+        n_remain = lookup[highbits] - 1;
+
+        if (n_remain < 0) {
+            // invalid sequence, abort
+            code_points.clear();
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
+        }
+
+        uint8_t mask = (1 << (7 - n_remain)) - 1;
+        value = first_byte & mask;
        ++pos;
-        for ( ; pos < end && *pos != 0; ++pos) {
+        while (*pos != 0 && n_remain > 0) {
            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+            ++pos;
+            --n_remain;
+        }
+        if (n_remain == 0) {
+            code_points.push_back(value);
        }
-        code_points.push_back(value);
    }
    code_points.push_back(0);
-    return code_points;
+
+    return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }
 
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
-        case LLAMA_GRETYPE_END: return true;
-        case LLAMA_GRETYPE_ALT: return true;
+        case LLAMA_GRETYPE_END: return true; // NOLINT
+        case LLAMA_GRETYPE_ALT: return true; // NOLINT
         default: return false;
     }
 }
@@ -2144,7 +4384,8 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
     bool found = false;
     bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
-    LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
 
     do {
         if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
@@ -2161,6 +4402,56 @@
     return std::make_pair(found == is_positive_char, pos);
 }
 
+// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
+// range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static bool llama_grammar_match_partial_char(
+        const llama_grammar_element * pos,
+        const llama_partial_utf8 partial_utf8) {
+
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+    uint32_t partial_value = partial_utf8.value;
+    int n_remain = partial_utf8.n_remain;
+
+    // invalid sequence or 7-bit char split across 2 bytes (overlong)
+    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
+        return false;
+    }
+
+    // range of possible code points this partial UTF-8 sequence could complete to
+    uint32_t low = partial_value << (n_remain * 6);
+    uint32_t high = low | ((1 << (n_remain * 6))
- 1); + + if (low == 0) { + if (n_remain == 2) { + low = 1 << 11; + } else if (n_remain == 3) { + low = 1 << 16; + } + } + + do { + if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) { + // inclusive range, e.g. [a-z] + if (pos->value <= high && low <= pos[1].value) { + return is_positive_char; + } + pos += 2; + } else { + // exact char match, e.g. [a] or "a" + if (low <= pos->value && pos->value <= high) { + return is_positive_char; + } + pos += 1; + } + } while (pos->type == LLAMA_GRETYPE_CHAR_ALT); + + return !is_positive_char; +} + + // transforms a grammar pushdown stack into N possible stacks, all ending // at a character range (terminal element) static void llama_grammar_advance_stack( @@ -2169,7 +4460,7 @@ static void llama_grammar_advance_stack( std::vector> & new_stacks) { if (stack.empty()) { - new_stacks.push_back(stack); + new_stacks.emplace_back(stack); return; } @@ -2206,13 +4497,13 @@ static void llama_grammar_advance_stack( } case LLAMA_GRETYPE_CHAR: case LLAMA_GRETYPE_CHAR_NOT: - new_stacks.push_back(stack); + new_stacks.emplace_back(stack); break; default: // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on // those - LLAMA_ASSERT(false); + GGML_ASSERT(false); } } @@ -2261,8 +4552,11 @@ static std::vector llama_grammar_reject_candidates_for_ std::vector rejects; if (stack.empty()) { - // accept nothing; EOS is handled elsewhere - rejects.insert(rejects.end(), candidates.begin(), candidates.end()); + for (auto tok : candidates) { + if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) { + rejects.push_back(tok); + } + } return rejects; } @@ -2270,16 +4564,21 @@ static std::vector llama_grammar_reject_candidates_for_ std::vector next_candidates; for (auto tok : candidates) { - if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) { - if (tok.code_points[1] != 0) { - next_candidates.push_back({ tok.index, tok.code_points + 1 }); + if (*tok.code_points == 0) { + // reached end of full codepoints in token, reject iff it ended in a partial sequence + // that cannot satisfy this position in grammar + if (tok.partial_utf8.n_remain != 0 && + !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) { + rejects.push_back(tok); } + } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) { + next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 }); } else { rejects.push_back(tok); } } - auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second; + const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second; // update top of stack to next element, if any std::vector stack_after(stack.begin(), stack.end() - 1); @@ -2291,7 +4590,7 @@ static std::vector llama_grammar_reject_candidates_for_ auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates); for (auto tok : next_rejects) { - rejects.push_back({ tok.index, tok.code_points - 1 }); + rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 }); } return rejects; @@ -2301,7 +4600,7 @@ static std::vector llama_grammar_reject_candidates( const std::vector> & rules, const std::vector> & stacks, const std::vector & candidates) { - LLAMA_ASSERT(!stacks.empty()); // REVIEW + GGML_ASSERT(!stacks.empty()); // REVIEW if (candidates.empty()) { return std::vector(); @@ -2356,19 +4655,38 @@ struct llama_grammar * llama_grammar_init( } } while (true); - return new llama_grammar{ std::move(vec_rules), 
std::move(stacks) }; + return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} }; } void llama_grammar_free(struct llama_grammar * grammar) { delete grammar; } +struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) { + llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 }; + + // redirect elements in stacks to point to new rules + for (size_t is = 0; is < result->stacks.size(); is++) { + for (size_t ie = 0; ie < result->stacks[is].size(); ie++) { + for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) { + for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) { + if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) { + result->stacks[is][ie] = &result->rules[ir0][ir1]; + } + } + } + } + } + + return result; +} + // // sampling // void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) { - assert(candidates->size > 0); + GGML_ASSERT(candidates->size > 0); const int64_t t_start_sample_us = ggml_time_us(); @@ -2474,7 +4792,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * // Calculate absolute value of second derivatives for (size_t i = 0; i < second_derivatives.size(); ++i) { - second_derivatives[i] = abs(second_derivatives[i]); + second_derivatives[i] = std::abs(second_derivatives[i]); } // Normalize the second derivatives @@ -2512,7 +4830,6 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * } } - void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { // Reference implementation: // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr @@ -2527,7 +4844,10 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c float entropy = 0.0f; for (size_t i = 0; i < candidates->size; ++i) { - entropy += -candidates->data[i].p * logf(candidates->data[i].p); + if(candidates->data[i].p>0) + { + entropy += -candidates->data[i].p * logf(candidates->data[i].p); + } } // Compute the absolute difference between negative log probability and entropy for each candidate @@ -2649,7 +4969,7 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l } void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) { - assert(ctx); + GGML_ASSERT(ctx); const int64_t t_start_sample_us = ggml_time_us(); bool allow_eos = false; @@ -2660,29 +4980,28 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c } } - const llama_token eos = llama_token_eos(); + const llama_token eos = llama_token_eos(ctx); - std::vector> candidates_decoded; - std::vector candidates_grammar; + std::vector, llama_partial_utf8>> candidates_decoded; + std::vector candidates_grammar; for (size_t i = 0; i < candidates->size; ++i) { - const llama_token id = candidates->data[i].id; - const char * str = llama_token_to_str(ctx, id); + const llama_token id = candidates->data[i].id; + const std::string piece = llama_token_to_str(ctx, id); if (id == eos) { if (!allow_eos) { candidates->data[i].logit = -INFINITY; } - } else if (*str == 0) { + } else if (piece.empty() || piece[0] == 0) { candidates->data[i].logit = -INFINITY; } else { - candidates_decoded.push_back(decode_utf8(str)); - candidates_grammar.push_back({ i, candidates_decoded.back().data() }); + 
candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8)); + candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second }); } } - const auto rejects = - llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar); - for (auto & reject : rejects) { + const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar); + for (const auto & reject : rejects) { candidates->data[reject.index].logit = -INFINITY; } @@ -2710,10 +5029,12 @@ void llama_sample_classifier_free_guidance( float scale) { int64_t t_start_sample_us = ggml_time_us(); - assert(ctx); + GGML_ASSERT(ctx); + auto n_vocab = llama_n_vocab(ctx); - assert(n_vocab == (int)candidates->size); - assert(!candidates->sorted); + + GGML_ASSERT(n_vocab == (int)candidates->size); + GGML_ASSERT(!candidates->sorted); std::vector logits_base; logits_base.reserve(candidates->size); @@ -2737,7 +5058,8 @@ void llama_sample_classifier_free_guidance( } llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { - assert(ctx); + GGML_ASSERT(ctx); + auto N = float(llama_n_vocab(ctx)); int64_t t_start_sample_us; t_start_sample_us = ggml_time_us(); @@ -2843,7 +5165,8 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da } llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) { - assert(ctx); + GGML_ASSERT(ctx); + const int64_t t_start_sample_us = ggml_time_us(); llama_sample_softmax(nullptr, candidates); @@ -2853,80 +5176,342 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra probs.push_back(candidates->data[i].p); } - std::discrete_distribution<> dist(probs.begin(), probs.end()); - auto & rng = ctx->rng; - int idx = dist(rng); + std::discrete_distribution<> dist(probs.begin(), probs.end()); + auto & rng = ctx->rng; + int idx = dist(rng); + + llama_token result = candidates->data[idx].id; + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->n_sample++; + return result; +} + +void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) { + const int64_t t_start_sample_us = ggml_time_us(); + + if (token == llama_token_eos(ctx)) { + for (const auto & stack : grammar->stacks) { + if (stack.empty()) { + return; + } + } + GGML_ASSERT(false); + } + + const std::string piece = llama_token_to_str(ctx, token); + + // Note terminating 0 in decoded string + const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8); + const auto & code_points = decoded.first; + for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { + grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); + } + grammar->partial_utf8 = decoded.second; + GGML_ASSERT(!grammar->stacks.empty()); + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; +} + +// +// Beam search +// + +struct llama_beam { + std::vector tokens; + float p; // Cumulative beam probability (renormalized relative to all beams) + bool eob; // Initialize end-of-beam to false. Callback sets this to true. + // Sort beams by probability. In case of ties, prefer beams at eob. + bool operator<(const llama_beam & rhs) const { + return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob); + } + // Shift off first n tokens and discard them. 
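+    // The n tokens shifted off are the common prefix that loop() has already
+    // committed to the context via llama_eval(), so only each beam's divergent
+    // suffix is kept and re-evaluated; e.g. if all beams agree on their first 3
+    // tokens, those 3 are evaluated once and dropped from every beam.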
+    void shift_tokens(const size_t n) {
+        if (n) {
+            std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
+            tokens.resize(tokens.size() - n);
+        }
+    }
+    llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+};
+
+// A struct for calculating logit-related info.
+struct llama_logit_info {
+    const float * const logits;
+    const int n_vocab;
+    const float max_l;
+    const float normalizer;
+    struct sum_exp {
+        float max_l;
+        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
+    };
+    llama_logit_info(llama_context * ctx)
+      : logits(llama_get_logits(ctx))
+      , n_vocab(llama_n_vocab(ctx))
+      , max_l(*std::max_element(logits, logits + n_vocab))
+      , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
+      { }
+    llama_token_data get_token_data(const llama_token token_id) const {
+        constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
+        return {token_id, logits[token_id], p};
+    }
+    // Return top k token_data by logit.
+    std::vector<llama_token_data> top_k(size_t k) {
+        std::vector<llama_token_data> min_heap; // min-heap by logit
+        const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
+        min_heap.reserve(k_min);
+        for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
+            min_heap.push_back(get_token_data(token_id));
+        }
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
+        std::make_heap(min_heap.begin(), min_heap.end(), comp);
+        for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
+            if (min_heap.front().logit < logits[token_id]) {
+                std::pop_heap(min_heap.begin(), min_heap.end(), comp);
+                min_heap.back().id = token_id;
+                min_heap.back().logit = logits[token_id];
+                std::push_heap(min_heap.begin(), min_heap.end(), comp);
+            }
+        }
+        return min_heap;
+    }
+    float probability_from_logit(float logit) const {
+        return normalizer * std::exp(logit - max_l);
+    }
+};
+
+struct llama_beam_search_data {
+    llama_context * ctx;
+    size_t n_beams;
+    int n_past;
+    int n_predict;
+    int n_threads;
+    std::vector<llama_beam> beams;
+    std::vector<llama_beam> next_beams;
+
+    // Re-calculated on each loop iteration
+    size_t common_prefix_length;
+
+    // Used to communicate to/from callback on beams state.
+    std::vector<llama_beam_view> beam_views;
+
+    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
+      : ctx(ctx)
+      , n_beams(n_beams)
+      , n_past(n_past)
+      , n_predict(n_predict)
+      , n_threads(n_threads)
+      , beam_views(n_beams) {
+        beams.reserve(n_beams);
+        next_beams.reserve(n_beams);
+    }
+
+    // Collapse beams to a single beam given by index.
+    void collapse_beams(const size_t beam_idx) {
+        if (0u < beam_idx) {
+            std::swap(beams[0], beams[beam_idx]);
+        }
+        beams.resize(1);
+    }
+
+    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
+    // The repetitive patterns below reflect the 2 stages of heaps:
+    //  * Gather elements until the vector is full, then call std::make_heap() on it.
+    //  * If the heap is full and a new element is found that should be included, pop the
+    //    least element to the back(), replace it with the new, then push it into the heap.
+    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
+        // Min-heaps use a greater-than comparator.
+        const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
+        if (beam.eob) {
+            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
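+            // (Worked example with n_beams = 2: if next_beams holds p = {0.5, 0.3},
+            // the min-heap front is 0.3; an eob beam with p = 0.4 replaces it below,
+            // while one with p = 0.2 would be dropped.)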
+ if (next_beams.size() < n_beams) { + next_beams.push_back(std::move(beam)); + if (next_beams.size() == n_beams) { + std::make_heap(next_beams.begin(), next_beams.end(), comp); + } + } else if (next_beams.front().p < beam.p) { + std::pop_heap(next_beams.begin(), next_beams.end(), comp); + next_beams.back() = std::move(beam); + std::push_heap(next_beams.begin(), next_beams.end(), comp); + } + } else { + // beam is not at end-of-sentence, so branch with next top_k tokens. + if (!beam.tokens.empty()) { + llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads); + } + llama_logit_info logit_info(ctx); + std::vector next_tokens = logit_info.top_k(n_beams); + size_t i=0; + if (next_beams.size() < n_beams) { + for (; next_beams.size() < n_beams ; ++i) { + llama_beam next_beam = beam; + next_beam.tokens.push_back(next_tokens[i].id); + next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit); + next_beams.push_back(std::move(next_beam)); + } + std::make_heap(next_beams.begin(), next_beams.end(), comp); + } else { + for (; next_beams.front().p == 0.0f ; ++i) { + std::pop_heap(next_beams.begin(), next_beams.end(), comp); + next_beams.back() = beam; + next_beams.back().tokens.push_back(next_tokens[i].id); + next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit); + std::push_heap(next_beams.begin(), next_beams.end(), comp); + } + } + for (; i < n_beams ; ++i) { + const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit); + if (next_beams.front().p < next_p) { + std::pop_heap(next_beams.begin(), next_beams.end(), comp); + next_beams.back() = beam; + next_beams.back().tokens.push_back(next_tokens[i].id); + next_beams.back().p = next_p; + std::push_heap(next_beams.begin(), next_beams.end(), comp); + } + } + } + } + + // Find common_prefix_length based on beams. + // Requires beams is not empty. + size_t find_common_prefix_length() { + size_t common_prefix_length = beams[0].tokens.size(); + for (size_t i = 1 ; i < beams.size() ; ++i) { + common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size()); + for (size_t j = 0 ; j < common_prefix_length ; ++j) { + if (beams[0].tokens[j] != beams[i].tokens[j]) { + common_prefix_length = j; + break; + } + } + } + return common_prefix_length; + } + + // Construct beams_state to send back to caller via the callback function. + // Side effect: set common_prefix_length = find_common_prefix_length(); + llama_beams_state get_beams_state(const bool last_call) { + for (size_t i = 0 ; i < beams.size() ; ++i) { + beam_views[i] = beams[i].view(); + } + common_prefix_length = find_common_prefix_length(); + return {beam_views.data(), beams.size(), common_prefix_length, last_call}; + } + + // Loop: + // * while i < n_predict, AND + // * any of the beams have not yet reached end-of-beam (eob), AND + // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence + // (since all other beam probabilities can only decrease) + void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) { + beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob. 
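+        // Each iteration hands the caller a mutable view of all beams via the
+        // callback; the caller is expected to set eob on finished beams (and may
+        // rescale p) before control returns here, since this loop itself has no
+        // notion of an end-of-sentence token.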
+        const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
+        for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
+                       !beams[top_beam_index()].eob ; ++i) {
+            callback(callback_data, get_beams_state(false));  // Sets common_prefix_length
+            update_beams_from_beam_views();   // Update values (p,eob) that callback may have changed.
+            if (common_prefix_length) {
+                llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
+                n_past += common_prefix_length;
+            }
+            // Zero-out next_beam probabilities to place them last in following min-heap.
+            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
+            for (llama_beam & beam : beams) {
+                beam.shift_tokens(common_prefix_length);
+                fill_next_beams_by_top_probabilities(beam);
+            }
+            // next_beams become the beams of next/final iteration. Swap them to re-use memory.
+            beams.swap(next_beams);
+            renormalize_beam_probabilities(beams);
+        }
+        collapse_beams(top_beam_index());
+        callback(callback_data, get_beams_state(true));
+    }
+
+    // As beams grow, the cumulative probabilities decrease.
+    // Renormalize them to avoid floating point underflow.
+    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
+        const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
+        const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
+        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
+    }
 
-    llama_token result = candidates->data[idx].id;
+    // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
+    size_t top_beam_index() {
+        return std::max_element(beams.begin(), beams.end()) - beams.begin();
+    }
 
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    ctx->n_sample++;
-    return result;
-}
+    // Copy (p,eob) for each beam which may have been changed by the callback.
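+    // A minimal caller-side sketch of such a callback (hypothetical names; assumes
+    // the eos token id was stashed in callback_data by the caller):
+    //
+    //   static void beam_search_cb(void * data, llama_beams_state state) {
+    //       const llama_token eos_id = *(const llama_token *) data;
+    //       for (size_t i = 0; i < state.n_beams; ++i) {
+    //           llama_beam_view & bv = state.beam_views[i];
+    //           if (!bv.eob && bv.n_tokens > 0 && bv.tokens[bv.n_tokens - 1] == eos_id) {
+    //               bv.eob = true; // picked up by update_beams_from_beam_views() below
+    //           }
+    //       }
+    //   }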
+    void update_beams_from_beam_views() {
+        for (size_t i = 0 ; i < beams.size() ; ++i) {
+            beams[i].p = beam_views[i].p;
+            beams[i].eob = beam_views[i].eob;
+        }
+    }
+};
 
-void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
+void llama_beam_search(llama_context * ctx,
+                       llama_beam_search_callback_fn_t callback, void * callback_data,
+                       size_t n_beams, int n_past, int n_predict, int n_threads) {
+    assert(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
 
-    if (token == llama_token_eos()) {
-        for (const auto & stack : grammar->stacks) {
-            if (stack.empty()) {
-                return;
-            }
-        }
-        LLAMA_ASSERT(false);
-    }
+    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
 
-    const char * str = llama_token_to_str(ctx, token);
-    // Note terminating 0 in decoded string
-    auto code_points = decode_utf8(str);
-    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
-    }
-    LLAMA_ASSERT(!grammar->stacks.empty());
+    beam_search_data.loop(callback, callback_data);
 
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    ctx->n_sample++;
 }
 
 //
 // quantization
 //
 
-static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
-    if (output.size < nelements * sizeof(float)) {
-        output.resize(nelements * sizeof(float));
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
+static void llama_convert_tensor_internal(
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
+    if (output.size() < nelements) {
+        output.resize(nelements);
     }
-    float * f32_output = (float *) output.addr;
+    float * f32_output = (float *) output.data();
 
     ggml_type_traits_t qtype;
-    if (ggml_is_quantized(tensor.type)) {
-        qtype = ggml_internal_get_type_traits(tensor.type);
+    if (ggml_is_quantized(tensor->type)) {
+        qtype = ggml_internal_get_type_traits(tensor->type);
         if (qtype.to_float == NULL) {
-            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
        }
-    } else if (tensor.type != GGML_TYPE_F16) {
-        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+    } else if (tensor->type != GGML_TYPE_F16) {
+        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
    }
 
    if (nthread < 2) {
-        if (tensor.type == GGML_TYPE_F16) {
-            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
-        } else if (ggml_is_quantized(tensor.type)) {
-            qtype.to_float(tensor.data, f32_output, nelements);
+        if (tensor->type == GGML_TYPE_F16) {
+            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+        } else if (ggml_is_quantized(tensor->type)) {
+            qtype.to_float(tensor->data, f32_output, nelements);
        } else {
-            LLAMA_ASSERT(false); // unreachable
+            GGML_ASSERT(false); // unreachable
        }
        return;
    }
 
+    auto block_size = tensor->type == GGML_TYPE_F16 ?
1 : (size_t)ggml_blck_size(tensor->type); + auto block_size_bytes = ggml_type_size(tensor->type); - LLAMA_ASSERT(nelements % block_size == 0); + GGML_ASSERT(nelements % block_size == 0); auto nblocks = nelements / block_size; auto blocks_per_thread = nblocks / nthread; auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count - std::vector workers; for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) { auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread auto thr_elems = thr_blocks * block_size; // number of elements for this thread @@ -2939,20 +5524,127 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam qtype.to_float(inbuf, outbuf, nels); } }; - workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems)); + workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems); in_buff_offs += thr_block_bytes; out_buff_offs += thr_elems; } - for (auto & worker : workers) { - worker.join(); + for (auto & w : workers) { w.join(); } + workers.clear(); +} + +#ifdef GGML_USE_K_QUANTS +static ggml_type get_k_quant_type( + ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv, + int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2 +) { + const std::string name = ggml_get_name(tensor); + // TODO: avoid hardcoded tensor names - use the TN_* constants + const auto tn = LLM_TN(model.arch); + + auto use_more_bits = [](int i_layer, int num_layers) -> bool { + return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; + }; + + if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { + int nx = tensor->ne[0]; + if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + new_type = GGML_TYPE_Q8_0; + } + else if (new_type != GGML_TYPE_Q8_0) { + new_type = GGML_TYPE_Q6_K; + } + } else if (name.find("attn_v.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && + use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; + else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && + (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; + if (model.type == MODEL_70B) { + // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is + // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with + // nearly negligible increase in model size by quantizing this tensor with more bits: + if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; + } + ++*i_attention_wv; + } else if (name.find("ffn_down.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + new_type = *i_feed_forward_w2 < 2 ? 
GGML_TYPE_Q5_K + : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K + : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { + if (model.arch == LLM_ARCH_FALCON) { + new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : + use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else { + if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + } + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) { + new_type = GGML_TYPE_Q5_K; + } + ++*i_feed_forward_w2; + } else if (name.find("attn_output.weight") != std::string::npos) { + if (model.arch != LLM_ARCH_FALCON) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + } else { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + } + } + else if (name.find("attn_qkv.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + } + else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + } + // This can be used to reduce the size of the Q5_K_S model. + // The associated PPL increase is fully in line with the size reduction + //else { + // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K; + //} + bool convert_incompatible_tensor = false; + if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || + new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { + int nx = tensor->ne[0]; + int ny = tensor->ne[1]; + if (nx % QK_K != 0) { + LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K); + convert_incompatible_tensor = true; + } + } + if (convert_incompatible_tensor) { + if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { + new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. + LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n"); + } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { + new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. 
+ LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); + } else { + throw std::runtime_error("Unsupported tensor size encountered\n"); + } } + return new_type; } +#endif static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type quantized_type; llama_ftype ftype = params->ftype; - int nthread = params->nthread; switch (params->ftype) { case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; @@ -2978,227 +5670,607 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } + int nthread = params->nthread; + if (nthread <= 0) { nthread = std::thread::hardware_concurrency(); } - std::unique_ptr model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false)); - llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype); + std::unique_ptr ml(new llama_model_loader(fname_inp, /*use_mmap*/ false)); + + llama_model model; + llm_load_arch(*ml, model); + llm_load_hparams(*ml, model, 0, 0, 0); + + if (params->only_copy) { + ftype = model.ftype; + } + + const size_t align = GGUF_DEFAULT_ALIGNMENT; + struct gguf_context * ctx_out = gguf_init_empty(); + + // copy the KV pairs from the input file + gguf_set_kv (ctx_out, ml->ctx_gguf); + gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); + gguf_set_val_u32(ctx_out, "general.file_type", ftype); + +#ifdef GGML_USE_K_QUANTS + int n_attention_wv = 0; + int n_feed_forward_w2 = 0; + + for (int i = 0; i < ml->n_tensors; ++i) { + struct ggml_tensor * meta = ml->get_tensor_meta(i); + + const std::string name = ggml_get_name(meta); + + // TODO: avoid hardcoded tensor names - use the TN_* constants + if (name.find("attn_v.weight") != std::string::npos) { + ++n_attention_wv; + } + else if (name.find("ffn_down.weight") != std::string::npos) { + ++n_feed_forward_w2; + } + } + if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) { + LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", + __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer); + } + + int i_attention_wv = 0; + int i_feed_forward_w2 = 0; +#endif + + size_t total_size_org = 0; + size_t total_size_new = 0; + std::vector hist_all(1 << 4, 0); + + std::vector workers; + workers.reserve(nthread); + std::mutex mutex; + + int idx = 0; + + std::vector> read_data; + std::vector> work; + std::vector> f32_conv_buf; + + // populate the original tensors so we get an initial meta data + for (int i = 0; i < ml->n_tensors; ++i) { + struct ggml_tensor * meta = ml->get_tensor_meta(i); + gguf_add_tensor(ctx_out, meta); + } + + std::ofstream fout(fname_out, std::ios::binary); + + const size_t meta_size = gguf_get_meta_size(ctx_out); + + LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size); + + // placeholder for the meta data + ::zeros(fout, meta_size); + + for (int i = 0; i < ml->n_tensors; ++i) { + struct ggml_tensor * tensor = ml->get_tensor_meta(i); + + const std::string name = ggml_get_name(tensor); + + if (read_data.size() < ggml_nbytes(tensor)) { + read_data.resize(ggml_nbytes(tensor)); + } + tensor->data = read_data.data(); + ml->load_data_for(tensor); + + LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", + ++idx, ml->n_tensors, + ggml_get_name(tensor), + 
llama_format_tensor_shape(tensor).c_str(), + ggml_type_name(tensor->type)); + + // This used to be a regex, but has an extreme cost to compile times. + bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? + + // quantize only 2D tensors + quantize &= (tensor->n_dims == 2); + quantize &= params->quantize_output_tensor || name != "output.weight"; + quantize &= !params->only_copy; + + enum ggml_type new_type; + void * new_data; + size_t new_size; + + if (quantize) { + new_type = quantized_type; +#ifdef GGML_USE_K_QUANTS + new_type = get_k_quant_type( + new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2 + ); +#endif + // If we've decided to quantize to the same type the tensor is already + // in then there's nothing to do. + quantize = tensor->type != new_type; + } + if (!quantize) { + new_type = tensor->type; + new_data = tensor->data; + new_size = ggml_nbytes(tensor); + LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); + } else { + const size_t nelements = ggml_nelements(tensor); + + float * f32_data; + + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { + throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); + } else { + llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread); + f32_data = (float *) f32_conv_buf.data(); + } + + LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type)); + fflush(stdout); + + if (work.size() < nelements * 4) { + work.resize(nelements * 4); // upper bound on size + } + new_data = work.data(); + std::array hist_cur = {}; + + static const int chunk_size = 32 * 512; + const int nchunk = (nelements + chunk_size - 1)/chunk_size; + const int nthread_use = nthread > 1 ? 
std::max(1, std::min(nthread, nchunk)) : 1; + if (nthread_use < 2) { + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); + } else { + size_t counter = 0; + new_size = 0; + auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() { + std::array local_hist = {}; + size_t local_size = 0; + while (true) { + std::unique_lock lock(mutex); + size_t first = counter; counter += chunk_size; + if (first >= nelements) { + if (local_size > 0) { + for (int j=0; j %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); + int64_t tot_count = 0; + for (size_t i = 0; i < hist_cur.size(); i++) { + hist_all[i] += hist_cur[i]; + tot_count += hist_cur[i]; + } + + if (tot_count > 0) { + for (size_t i = 0; i < hist_cur.size(); i++) { + LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements)); + } + } + LLAMA_LOG_INFO("\n"); + } + total_size_org += ggml_nbytes(tensor); + total_size_new += new_size; + + // update the gguf meta data as we go + gguf_set_tensor_type(ctx_out, name.c_str(), new_type); + gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); + + // write tensor data + padding + fout.write((const char *) new_data, new_size); + zeros(fout, GGML_PAD(new_size, align) - new_size); + } + + // go back to beginning of file and write the updated meta data + { + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *) data.data(), data.size()); + } + + fout.close(); + + gguf_free(ctx_out); + + LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); + + // print histogram for all tensors + { + int64_t sum_all = 0; + for (size_t i = 0; i < hist_all.size(); i++) { + sum_all += hist_all[i]; + } + + if (sum_all > 0) { + LLAMA_LOG_INFO("%s: hist: ", __func__); + for (size_t i = 0; i < hist_all.size(); i++) { + LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all)); + } + LLAMA_LOG_INFO("\n"); + } + } +} + +// TODO: after the GGUF PR, this likely won't work and needs to be updated +static int llama_apply_lora_from_file_internal( + const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads +) { + LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); + + const int64_t t_start_lora_us = ggml_time_us(); + + auto fin = std::ifstream(path_lora, std::ios::binary); + if (!fin) { + LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora); + return 1; + } + + // verify magic and version + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + uint32_t format_version; + fin.read((char *) &format_version, sizeof(format_version)); + + if (format_version != 1) { + LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); + return 1; + } + } + + int32_t lora_r; + int32_t lora_alpha; + fin.read((char *) &lora_r, sizeof(lora_r)); + fin.read((char *) &lora_alpha, sizeof(lora_alpha)); + float scaling = (float)lora_alpha / (float)lora_r; + + LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); + + // create a temporary ggml context to store the lora tensors + // todo: calculate size from biggest possible tensor + std::vector lora_buf(1024ull * 1024ull * 1024ull); + struct ggml_init_params params; + params.mem_size = lora_buf.size(); + params.mem_buffer = lora_buf.data(); + 
params.no_alloc = false; + + ggml_context * lora_ctx = ggml_init(params); + std::unordered_map lora_tensors; + + // create a name -> tensor map of the model to accelerate lookups + std::unordered_map model_tensors; + for (const auto & kv : model.tensors_by_name) { + model_tensors.insert(kv); + } + + // load base model + std::unique_ptr ml; + ggml_context * base_ctx = NULL; + std::vector base_buf; + if (path_base_model) { + LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); + ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true)); + + size_t ctx_size; + size_t mmapped_size; + ml->calc_sizes(ctx_size, mmapped_size); + base_buf.resize(ctx_size); + + ggml_init_params base_params; + base_params.mem_size = base_buf.size(); + base_params.mem_buffer = base_buf.data(); + base_params.no_alloc = ml->use_mmap; + + base_ctx = ggml_init(base_params); + + // maybe this should in llama_model_loader + if (ml->use_mmap) { + ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa())); + } + } + + // read tensors and apply + bool warned = false; + int n_tensors = 0; + + std::vector work_buffer; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ftype; -#ifdef GGML_USE_K_QUANTS - int n_attention_wv = 0; - int n_feed_forward_w2 = 0; - for (auto& tensor : model_loader->tensors_map.tensors) { - if (tensor.name.find("attention.wv.weight") != std::string::npos) { - ++n_attention_wv; + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ftype), sizeof(ftype)); + if (fin.eof()) { + break; } - else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) { - ++n_feed_forward_w2; + + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); } - } - int i_attention_wv = 0; - int i_feed_forward_w2 = 0; -#endif + std::string name; + { + char buf[1024]; + fin.read(buf, length); + name = std::string(buf, length); + } - size_t total_size_org = 0; - size_t total_size_new = 0; - std::vector hist_all(1 << 4, 0); + // check for lora suffix and get the type of tensor + const std::string lora_suffix = ".lora"; + size_t pos = name.rfind(lora_suffix); + if (pos == std::string::npos) { + LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); + return 1; + } - std::vector workers; - std::mutex mutex; + std::string lora_type = name.substr(pos + lora_suffix.length()); + std::string base_name = name; + base_name.erase(pos); + // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); - auto use_more_bits = [] (int i_layer, int num_layers) -> bool { - return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; - }; + if (model_tensors.find(base_name) == model_tensors.end()) { + LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); + return 1; + } + + // create ggml tensor + ggml_type wtype; + switch (ftype) { + case 0: wtype = GGML_TYPE_F32; break; + case 1: wtype = GGML_TYPE_F16; break; + default: + { + LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", + __func__, ftype); + return false; + } + } + ggml_tensor * lora_tensor; + if (n_dims == 2) { + lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); + } + else { + LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); + return 1; + } + ggml_set_name(lora_tensor, "lora_tensor"); 
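Each record in the adapter stream read above is keyed by name: the base tensor name plus a `.loraA` / `.loraB` suffix, and a pair is applied as soon as both halves have been seen. A small self-contained sketch of just that name split (the record name here is a hypothetical example):

```cpp
// Sketch of the ".lora" name convention the loader above relies on: an
// adapter record "layers.0.attention.wq.weight.loraA" patches the model
// tensor "layers.0.attention.wq.weight", paired with its ".loraB" half.
#include <cstdio>
#include <string>

int main() {
    const std::string name = "layers.0.attention.wq.weight.loraA"; // hypothetical record name
    const std::string lora_suffix = ".lora";

    const size_t pos = name.rfind(lora_suffix);
    if (pos == std::string::npos) {
        fprintf(stderr, "'%s' is not a lora tensor\n", name.c_str());
        return 1;
    }

    const std::string lora_type = name.substr(pos + lora_suffix.length()); // "A" or "B"
    const std::string base_name = name.substr(0, pos);                     // tensor to patch

    printf("base = '%s', half = '%s'\n", base_name.c_str(), lora_type.c_str());
    return 0;
}
```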
- size_t idx = 0; - for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) { - llama_buffer read_data; - read_data.resize(tensor.size); - tensor.data = read_data.addr; - model_loader->load_data_for(tensor); + // load tensor data + size_t offset = fin.tellg(); + size_t tensor_data_size = ggml_nbytes(lora_tensor); + offset = (offset + 31) & -32; + fin.seekg(offset); + fin.read((char*)lora_tensor->data, tensor_data_size); - printf("[%4zu/%4zu] %36s - %16s, type = %6s, ", - ++idx, model_loader->tensors_map.tensors.size(), - tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), - ggml_type_name(tensor.type)); + lora_tensors[name] = lora_tensor; - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'? + // check if we have both A and B tensors and apply + if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && + lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { - // quantize only 2D tensors - quantize &= (tensor.ne.size() == 2); - quantize &= params->quantize_output_tensor || tensor.name != "output.weight"; - quantize &= quantized_type != tensor.type; + ggml_tensor * dest_t = model_tensors[base_name]; - enum ggml_type new_type; - void * new_data; - size_t new_size; - llama_buffer work; + offload_func_t offload_func = llama_nop; + offload_func_t offload_func_force_inplace = llama_nop; - if (!quantize) { - new_type = tensor.type; - new_data = tensor.data; - new_size = tensor.size; - printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0); - } else { - new_type = quantized_type; -#ifdef GGML_USE_K_QUANTS - if (tensor.name == "output.weight") { - int nx = tensor.ne.at(0); - int ny = tensor.ne.at(1); - if (nx % QK_K == 0 && ny % QK_K == 0) { - new_type = GGML_TYPE_Q6_K; +#ifdef GGML_USE_CUBLAS + if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { + if (dest_t->type != GGML_TYPE_F16) { + throw std::runtime_error(format( + "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__)); } - } else if (tensor.name.find("attention.wv.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; - else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && - (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; - ++i_attention_wv; - } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K; - ++i_feed_forward_w2; - } else if (tensor.name.find("attention.wo.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = 
GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + offload_func = ggml_cuda_assign_buffers; + offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; } - bool convert_incompatible_tensor = false; - if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || - new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { - int nx = tensor.ne.at(0); - int ny = tensor.ne.at(1); - if (nx % QK_K != 0 || ny % QK_K != 0) { - fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); - convert_incompatible_tensor = true; +#endif // GGML_USE_CUBLAS + + ggml_tensor * base_t; + if (ml) { + struct gguf_context * ctx_gguf = ml->ctx_gguf; + + // load from base model + if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { + // TODO: throw + LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); + return 1; } + + // TODO: not tested!! maybe not working! + base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); + ml->load_data_for(base_t); + } else { + base_t = dest_t; } - if (convert_incompatible_tensor) { - if (tensor.name == "output.weight") { - new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. - fprintf(stderr, "F16 will be used for this tensor instead.\n"); - } else if (tensor.name == "tok_embeddings.weight") { - new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. - fprintf(stderr, "Q4_0 will be used for this tensor instead.\n"); - } else { - throw std::runtime_error("Unsupported tensor size encountered\n"); + + if (ggml_is_quantized(base_t->type)) { + if (!warned) { + LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; } } -#endif - float * f32_data; - size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); - llama_buffer f32_conv_buf; + ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; + GGML_ASSERT(loraA->type == GGML_TYPE_F32); + ggml_set_name(loraA, "loraA"); - if (tensor.type == GGML_TYPE_F32) { - f32_data = (float *) tensor.data; - } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) { - throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type))); - } else { - llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread); - f32_data = (float *) f32_conv_buf.addr; + ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; + GGML_ASSERT(loraB->type == GGML_TYPE_F32); + ggml_set_name(loraB, "loraB"); + + if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { + LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" + " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + return 1; } - printf("quantizing to %s .. 
", ggml_type_name(new_type)); - fflush(stdout); + // w = w + BA*s + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + offload_func(BA); + ggml_set_name(BA, "BA"); + + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + ggml_set_name(scale_tensor, "scale_tensor"); - work.resize(nelements * 4); // upper bound on size - new_data = work.addr; - std::vector hist_cur(1 << 4, 0); + BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + offload_func(BA); + ggml_set_name(BA, "BA_scaled"); + } - int chunk_size = 32 * 512; - const int nchunk = (nelements + chunk_size - 1)/chunk_size; - const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; - if (nthread_use < 2) { - new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); - } else { - size_t counter = 0; - new_size = 0; - auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () { - std::vector local_hist; - size_t local_size = 0; - while (true) { - std::unique_lock lock(mutex); - size_t first = counter; counter += chunk_size; - if (first >= nelements) { - if (!local_hist.empty()) { - for (int j=0; j %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); - int64_t tot_count = 0; - for (size_t i = 0; i < hist_cur.size(); i++) { - hist_all[i] += hist_cur[i]; - tot_count += hist_cur[i]; + r = ggml_cpy(lora_ctx, r, dest_t); + offload_func(r); + ggml_set_name(r, "r_cpy"); } - if (tot_count > 0) { - for (size_t i = 0; i < hist_cur.size(); i++) { - printf("%5.3f ", hist_cur[i] / float(nelements)); - } + struct ggml_cgraph gf = ggml_build_forward(r); + + ggml_graph_compute_helper(work_buffer, &gf, n_threads); + + // we won't need these tensors again, reset the context to save memory + ggml_free(lora_ctx); + lora_ctx = ggml_init(params); + lora_tensors.clear(); + + n_tensors++; + if (n_tensors % 4 == 0) { + LLAMA_LOG_INFO("."); } - printf("\n"); } - total_size_org += tensor.size; - total_size_new += new_size; - file_saver.write_tensor(tensor, new_type, new_data, new_size); } - printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); + // TODO: this should be in a destructor, it will leak on failure + ggml_free(lora_ctx); + if (base_ctx) { + ggml_free(base_ctx); + } + + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; + LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); + + return 0; +} + +// +// interface implementation +// + +struct llama_context_params llama_context_default_params() { + struct llama_context_params result = { + /*.seed =*/ LLAMA_DEFAULT_SEED, + /*.n_ctx =*/ 512, + /*.n_batch =*/ 512, + /*.n_gpu_layers =*/ 0, + /*.main_gpu =*/ 0, + /*.tensor_split =*/ nullptr, + /*.rope_freq_base =*/ 0.0f, + /*.rope_freq_scale =*/ 0.0f, + /*.progress_callback =*/ nullptr, + /*.progress_callback_user_data =*/ nullptr, + /*.low_vram =*/ false, + /*.mul_mat_q =*/ true, + /*.f16_kv =*/ true, + /*.logits_all =*/ false, + /*.vocab_only =*/ false, + /*.use_mmap =*/ true, + /*.use_mlock =*/ false, + /*.embedding =*/ false, + }; + +#ifdef GGML_USE_METAL + result.n_gpu_layers = 1; +#endif + + return result; +} + +struct llama_model_quantize_params llama_model_quantize_default_params() { + struct llama_model_quantize_params result = { + /*.nthread =*/ 0, + /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, + /*.allow_requantize =*/ false, + /*.quantize_output_tensor =*/ true, + 
/*.only_copy =*/ false, + }; + + return result; +} + +int llama_max_devices(void) { + return LLAMA_MAX_DEVICES; +} + +bool llama_mmap_supported(void) { + return llama_mmap::SUPPORTED; +} + +bool llama_mlock_supported(void) { + return llama_mlock::SUPPORTED; +} + +void llama_backend_init(bool numa) { + ggml_time_init(); + // needed to initialize f16 tables { - int64_t sum_all = 0; - for (size_t i = 0; i < hist_all.size(); i++) { - sum_all += hist_all[i]; - } + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } - if (sum_all > 0) { - printf("%s: hist: ", __func__); - for (size_t i = 0; i < hist_all.size(); i++) { - printf("%5.3f ", hist_all[i] / float(sum_all)); - } - printf("\n"); - } + if (numa) { + ggml_numa_init(); } -} +#ifdef GGML_USE_MPI + ggml_mpi_backend_init(); +#endif +} +void llama_backend_free(void) { +#ifdef GGML_USE_MPI + ggml_mpi_backend_free(); +#endif +} -// -// interface implementation -// +int64_t llama_time_us(void) { + return ggml_time_us(); +} struct llama_model * llama_load_model_from_file( const char * path_model, @@ -3209,12 +6281,28 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram, - memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, - params.progress_callback_user_data)) { + unsigned cur_percentage = 0; + if (params.progress_callback == NULL) { + params.progress_callback_user_data = &cur_percentage; + params.progress_callback = [](float progress, void * ctx) { + unsigned * cur_percentage_p = (unsigned *) ctx; + unsigned percentage = (unsigned) (100 * progress); + while (percentage > *cur_percentage_p) { + *cur_percentage_p = percentage; + LLAMA_LOG_INFO("."); + if (percentage >= 100) { + LLAMA_LOG_INFO("\n"); + } + } + }; + } + + if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers, + params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale, + params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, + params.progress_callback, params.progress_callback_user_data)) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; - fprintf(stderr, "%s: failed to load model\n", __func__); return nullptr; } @@ -3239,25 +6327,6 @@ struct llama_context * llama_new_context_with_model( params.seed = time(NULL); } - size_t blasbatchmul = get_blas_batch_mul(params.n_batch); - - unsigned cur_percentage = 0; - if (params.progress_callback == NULL) { - params.progress_callback_user_data = &cur_percentage; - params.progress_callback = [](float progress, void * ctx) { - unsigned * cur_percentage_p = (unsigned *) ctx; - unsigned percentage = (unsigned) (100 * progress); - while (percentage > *cur_percentage_p) { - *cur_percentage_p = percentage; - fprintf(stderr, "."); - fflush(stderr); - if (percentage >= 100) { - fprintf(stderr, "\n"); - } - } - }; - } - ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; @@ -3265,15 +6334,15 @@ struct llama_context * llama_new_context_with_model( // reserve memory for context buffers if (!params.vocab_only) { - if (!kv_cache_init(ctx->model.hparams, 
ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { - fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { + LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; } { const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); - fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } const auto & hparams = ctx->model.hparams; @@ -3289,7 +6358,6 @@ struct llama_context * llama_new_context_with_model( ctx->embedding.resize(hparams.n_embd); } -#ifdef LLAMA_USE_ALLOCATOR { static const size_t tensor_alignment = 32; // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data @@ -3301,405 +6369,193 @@ struct llama_context * llama_new_context_with_model( // build worst-case graph int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); int n_past = hparams.n_ctx - n_tokens; - llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); - +#ifdef GGML_USE_METAL + if (params.n_gpu_layers > 0) { + ctx->ctx_metal = ggml_metal_init(1); + if (!ctx->ctx_metal) { + LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); + llama_free(ctx); + return NULL; + } + ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); + ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); + } +#endif // measure memory requirements for the graph size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; - fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); - - // debug - for comparison with scratch buffer - //size_t prev_req = - // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) + - // MEM_REQ_SCRATCH1().at(ctx->model.type) + - // MEM_REQ_EVAL().at(ctx->model.type); - //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); // recreate allocator with exact memory requirements ggml_allocr_free(ctx->alloc); ctx->buf_alloc.resize(alloc_size); - ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment); - } -#else - ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead()); -#endif - -#ifdef LLAMA_USE_SCRATCH - ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); - ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1().at(ctx->model.type)); -#endif - } - + ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment); #ifdef GGML_USE_METAL - if (params.n_gpu_layers > 0) { - // this allocates all Metal resources and memory buffers - ctx->ctx_metal = 
ggml_metal_init(1); - - void * data_ptr = NULL; - size_t data_size = 0; - - if (params.use_mmap) { - data_ptr = ctx->model.mapping->addr; - data_size = ctx->model.mapping->size; - } else { - data_ptr = ggml_get_mem_buffer(ctx->model.ctx); - data_size = ggml_get_mem_size (ctx->model.ctx); - } - - const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - - fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); - -#define LLAMA_METAL_CHECK_BUF(result) \ - if (!(result)) { \ - fprintf(stderr, "%s: failed to add buffer\n", __func__); \ - llama_free(ctx); \ - return NULL; \ - } - - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0)); - - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); -#undef LLAMA_METAL_CHECK_BUF - } -#endif - -#ifdef GGML_USE_MPI - ctx->ctx_mpi = ggml_mpi_init(); - - if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { - // Enter a blocking eval loop with dummy input, letting rank=0 drive the process - const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos()); - while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; - llama_backend_free(); - exit(1); - } + if (ctx->ctx_metal) { + ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); + } +#endif +#ifdef GGML_USE_CUBLAS + if (params.low_vram) { + LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); + ggml_cuda_set_scratch_size(0); // disable scratch + } else { + ggml_cuda_set_scratch_size(alloc_size); + LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0); + } #endif - - return ctx; -} - -struct llama_context * llama_init_from_file( - const char * path_model, - struct llama_context_params params) { - - struct llama_model * model = llama_load_model_from_file(path_model, params); - if (!model) { - return nullptr; - } - struct llama_context * ctx = llama_new_context_with_model(model, params); - ctx->model_owner = true; - return ctx; -} - -void llama_free(struct llama_context * ctx) { - delete ctx; -} - -int llama_model_quantize( - const char * fname_inp, - const char * fname_out, - const llama_model_quantize_params *params) { - try { - llama_model_quantize_internal(fname_inp, fname_out, params); - return 0; - } catch (const std::exception & err) { - fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what()); - return 1; - } -} - -int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { - fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); - - const int64_t t_start_lora_us = ggml_time_us(); - - auto fin = std::ifstream(path_lora, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora); - return 1; - } - - // verify magic and version - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != LLAMA_FILE_MAGIC_GGLA) { - fprintf(stderr, "%s: bad file 
magic\n", __func__); - return 1; - } - uint32_t format_version; - fin.read((char *) &format_version, sizeof(format_version)); - - if (format_version != 1) { - fprintf(stderr, "%s: unsupported file version\n", __func__ ); - return 1; - } - } - - int32_t lora_r; - int32_t lora_alpha; - fin.read((char *) &lora_r, sizeof(lora_r)); - fin.read((char *) &lora_alpha, sizeof(lora_alpha)); - float scaling = (float)lora_alpha / (float)lora_r; - - fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); - - - // create a temporary ggml context to store the lora tensors - // todo: calculate size from biggest possible tensor - std::vector lora_buf(1024ull * 1024ull * 1024ull); - struct ggml_init_params params; - params.mem_size = lora_buf.size(); - params.mem_buffer = lora_buf.data(); - params.no_alloc = false; - - ggml_context * lora_ctx = ggml_init(params); - std::unordered_map lora_tensors; - - // create a name -> tensor map of the model to accelerate lookups - std::unordered_map model_tensors; - for (const auto & kv: model.tensors_by_name) { - model_tensors.insert(kv); - } - - - // load base model - std::unique_ptr model_loader; - ggml_context * base_ctx = NULL; - llama_buffer base_buf; - if (path_base_model) { - fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model); - model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true)); - - size_t ctx_size; - size_t mmapped_size; - model_loader->calc_sizes(&ctx_size, &mmapped_size); - base_buf.resize(ctx_size); - - ggml_init_params base_params; - base_params.mem_size = base_buf.size; - base_params.mem_buffer = base_buf.addr; - base_params.no_alloc = model_loader->use_mmap; - - base_ctx = ggml_init(base_params); - - model_loader->ggml_ctx = base_ctx; - - // maybe this should in llama_model_loader - if (model_loader->use_mmap) { - model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa())); - } - } - - // read tensors and apply - bool warned = false; - int n_tensors = 0; - - std::vector work_buffer; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ftype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ftype), sizeof(ftype)); - if (fin.eof()) { - break; - } - - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - } - - std::string name; - { - char buf[1024]; - fin.read(buf, length); - name = std::string(buf, length); - } - - // check for lora suffix and get the type of tensor - const std::string lora_suffix = ".lora"; - size_t pos = name.rfind(lora_suffix); - if (pos == std::string::npos) { - fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); - return 1; - } - - std::string lora_type = name.substr(pos + lora_suffix.length()); - std::string base_name = name; - base_name.erase(pos); - // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); - - if (model_tensors.find(base_name) == model_tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); - return 1; - } - - // create ggml tensor - ggml_type wtype; - switch (ftype) { - case 0: wtype = GGML_TYPE_F32; break; - case 1: wtype = GGML_TYPE_F16; break; - default: - { - fprintf(stderr, "%s: invalid tensor data type '%d'\n", - __func__, ftype); - return false; - } - } - 
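Both the new loader above and the old one below seek to tensor data on a 32-byte boundary via `(offset + 31) & -32`. A quick worked example of that bit trick (standalone, illustrative only):

```cpp
// Worked example of the 32-byte alignment used when seeking to tensor data.
// In two's complement, -32 is ...1110'0000, so ANDing clears the low 5 bits;
// adding 31 first turns the truncation into a round-up.
#include <cstddef>
#include <cstdio>

int main() {
    const size_t offsets[] = { 0, 1, 31, 32, 33, 100 };
    for (size_t offset : offsets) {
        const size_t aligned = (offset + 31) & -32;
        printf("%3zu -> %3zu\n", offset, aligned); // 0->0, 1->32, 31->32, 32->32, 33->64, 100->128
    }
    return 0;
}
```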
ggml_tensor * lora_tensor; - if (n_dims == 2) { - lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); - } - else { - fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims); - return 1; } - ggml_set_name(lora_tensor, "lora_tensor"); - // load tensor data - size_t offset = fin.tellg(); - size_t tensor_data_size = ggml_nbytes(lora_tensor); - offset = (offset + 31) & -32; - fin.seekg(offset); - fin.read((char*)lora_tensor->data, tensor_data_size); +#ifdef GGML_USE_METAL + if (params.n_gpu_layers > 0) { + // this allocates all Metal resources and memory buffers - lora_tensors[name] = lora_tensor; + void * data_ptr = NULL; + size_t data_size = 0; - // check if we have both A and B tensors and apply - if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && - lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { + if (params.use_mmap) { + data_ptr = ctx->model.mapping->addr; + data_size = ctx->model.mapping->size; + } else { + data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + data_size = ggml_get_mem_size (ctx->model.ctx); + } - ggml_tensor * dest_t = model_tensors[base_name]; + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - offload_func_t offload_func = llama_nop; - offload_func_t offload_func_force_inplace = llama_nop; + LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); -#ifdef GGML_USE_CUBLAS - if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { - if (dest_t->type != GGML_TYPE_F16) { - throw std::runtime_error(format( - "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__)); - } - offload_func = ggml_cuda_assign_buffers; - offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; +#define LLAMA_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ + llama_free(ctx); \ + return NULL; \ } -#endif // GGML_USE_CUBLAS - ggml_tensor * base_t; - if (model_loader) { - // load from base model - if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) { - fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); - return 1; - } - size_t idx = model_loader->tensors_map.name_to_idx[base_name]; - llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; - base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); - lt.data = (uint8_t *) lt.ggml_tensor->data; - model_loader->load_data_for(lt); - lt.ggml_tensor->data = lt.data; - } - else { - base_t = dest_t; - } + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - if (ggml_is_quantized(base_t->type)) { - if (!warned) { - fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " - "use a f16 or f32 base model with --lora-base\n", __func__); - warned = true; - } - } + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); - ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; - GGML_ASSERT(loraA->type == GGML_TYPE_F32); - ggml_set_name(loraA, "loraA"); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 
0)); +#undef LLAMA_METAL_CHECK_BUF + } +#endif + } - ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; - GGML_ASSERT(loraB->type == GGML_TYPE_F32); - ggml_set_name(loraB, "loraB"); +#ifdef GGML_USE_MPI + ctx->ctx_mpi = ggml_mpi_init(); - if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { - fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" - " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); - return 1; - } + if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { + // Enter a blocking eval loop with dummy input, letting rank=0 drive the process + const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); + while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; + llama_backend_free(); + exit(1); + } +#endif - // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); - offload_func(BA); - ggml_set_name(BA, "BA"); + return ctx; +} - if (scaling != 1.0f) { - ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); - ggml_set_name(scale_tensor, "scale_tensor"); +static struct llama_context * llama_init_from_file( + const char * path_model, + struct llama_context_params params) { + struct llama_model * model = llama_load_model_from_file(path_model, params); + if (!model) { + return nullptr; + } - BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); - offload_func(BA); - ggml_set_name(BA, "BA_scaled"); - } + struct llama_context * ctx = llama_new_context_with_model(model, params); + ctx->model_owner = true; - ggml_tensor * r; - if (base_t == dest_t) { - r = ggml_add_inplace(lora_ctx, dest_t, BA); - offload_func_force_inplace(r); - ggml_set_name(r, "r_add_inplace"); - } - else { - r = ggml_add(lora_ctx, base_t, BA); - offload_func(r); - ggml_set_name(r, "r_add"); + return ctx; +} - r = ggml_cpy(lora_ctx, r, dest_t); - offload_func(r); - ggml_set_name(r, "r_cpy"); - } +void llama_free(struct llama_context * ctx) { + delete ctx; +} - struct ggml_cgraph gf = ggml_build_forward(r); +int llama_n_vocab(const struct llama_context * ctx) { + return llama_model_n_vocab(&ctx->model); +} - ggml_graph_compute_helper(work_buffer, &gf, n_threads); +int llama_n_ctx(const struct llama_context * ctx) { + return llama_model_n_ctx(&ctx->model); +} - // we won't need these tensors again, reset the context to save memory - ggml_free(lora_ctx); - lora_ctx = ggml_init(params); - lora_tensors.clear(); +int llama_n_ctx_train(const struct llama_context * ctx) { + return llama_model_n_ctx_train(&ctx->model); +} - n_tensors++; - if (n_tensors % 4 == 0) { - fprintf(stderr, "."); - } - } - } +int llama_n_embd(const struct llama_context * ctx) { + return llama_model_n_embd(&ctx->model); +} - // TODO: this should be in a destructor, it will leak on failure - ggml_free(lora_ctx); - if (base_ctx) { - ggml_free(base_ctx); +enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) { + return ctx->model.vocab.type; +} + +int llama_model_n_vocab(const struct llama_model * model) { + return model->vocab.id_to_token.size(); +} + +int llama_model_n_ctx(const struct llama_model * model) { + return model->hparams.n_ctx; +} + +int llama_model_n_ctx_train(const struct llama_model * model) { + return model->hparams.n_ctx_train; +} + +int llama_model_n_embd(const struct llama_model * model) { + return model->hparams.n_embd; +} + +int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { + return snprintf(buf, buf_size, "%s %s %s", + model->name.c_str(), + 
llama_model_type_name(model->type), + llama_model_ftype_name(model->ftype).c_str()); +} + +uint64_t llama_model_size(const struct llama_model * model) { + uint64_t size = 0; + for (const auto & it : model->tensors_by_name) { + size += ggml_nbytes(it.second); } + return size; +} - const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; - fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); +uint64_t llama_model_n_params(const struct llama_model * model) { + uint64_t nparams = 0; + for (const auto & it : model->tensors_by_name) { + nparams += ggml_nelements(it.second); + } + return nparams; +} - return 0; +int llama_model_quantize( + const char * fname_inp, + const char * fname_out, + const llama_model_quantize_params * params) { + try { + llama_model_quantize_internal(fname_inp, fname_out, params); + return 0; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what()); + return 1; + } } int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { try { return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads); } catch (const std::exception & err) { - fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; } } @@ -3708,7 +6564,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha try { return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads); } catch (const std::exception & err) { - fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; } } @@ -3757,6 +6613,46 @@ size_t llama_get_state_size(const struct llama_context * ctx) { return s_total; } +// llama_context_data +struct llama_data_context { + virtual void write(const void * src, size_t size) = 0; + virtual size_t get_size_written() = 0; + virtual ~llama_data_context() = default; +}; + +struct llama_data_buffer_context : llama_data_context { + uint8_t * ptr; + size_t size_written = 0; + + llama_data_buffer_context(uint8_t * p) : ptr(p) {} + + void write(const void * src, size_t size) override { + memcpy(ptr, src, size); + ptr += size; + size_written += size; + } + + size_t get_size_written() override { + return size_written; + } +}; + +struct llama_data_file_context : llama_data_context { + llama_file * file; + size_t size_written = 0; + + llama_data_file_context(llama_file * f) : file(f) {} + + void write(const void * src, size_t size) override { + file->write_raw(src, size); + size_written += size; + } + + size_t get_size_written() override { + return size_written; + } +}; + /** copy state data into either a buffer or file depending on the passed in context * * file context: @@ -3770,7 +6666,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { * llama_copy_state_data(ctx, &data_ctx); * */ -void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { +static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { // copy rng { std::stringstream rng_ss; @@ -3890,7 +6786,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { rng_ss.str(std::string(&rng_buf[0], rng_size)); rng_ss >> ctx->rng; - LLAMA_ASSERT(rng_ss.fail() == false); + GGML_ASSERT(!rng_ss.fail()); } 
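`llama_copy_state_data_internal` is written against the abstract `llama_data_context` above precisely so that one serializer can target either a memory buffer (`llama_copy_state_data`) or a file (session saving). A trimmed-down sketch of the pattern, with a made-up payload standing in for the real rng/logits/kv-cache state:

```cpp
// Minimal sketch of the llama_data_context sink pattern: one serialization
// routine, interchangeable destinations. Names mirror the patch; the payload
// is a hypothetical stand-in.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

struct data_context {
    virtual void write(const void * src, size_t size) = 0;
    virtual size_t get_size_written() = 0;
    virtual ~data_context() = default;
};

struct buffer_context : data_context {
    uint8_t * ptr;
    size_t size_written = 0;
    explicit buffer_context(uint8_t * p) : ptr(p) {}
    void write(const void * src, size_t size) override {
        memcpy(ptr, src, size);
        ptr += size;
        size_written += size;
    }
    size_t get_size_written() override { return size_written; }
};

static void copy_state(data_context & dc) { // stand-in for llama_copy_state_data_internal
    const uint32_t magic   = 0x6767736eu;   // 'ggsn', as in LLAMA_SESSION_MAGIC
    const uint32_t version = 1;
    dc.write(&magic,   sizeof(magic));
    dc.write(&version, sizeof(version));
}

int main() {
    std::vector<uint8_t> buf(64);
    buffer_context bc(buf.data());
    copy_state(bc); // a file-backed context would work identically
    printf("state bytes written: %zu\n", bc.get_size_written());
    return 0;
}
```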
// set logits @@ -3901,7 +6797,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap); memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); - LLAMA_ASSERT(ctx->logits.capacity() == logits_cap); + GGML_ASSERT(ctx->logits.capacity() == logits_cap); if (logits_size) { ctx->logits.resize(logits_size); @@ -3917,7 +6813,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size); - LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size); + GGML_ASSERT(ctx->embedding.capacity() == embedding_size); if (embedding_size) { memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float)); @@ -3940,7 +6836,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok); if (kv_size) { - LLAMA_ASSERT(kv_self.buf.size == kv_size); + GGML_ASSERT(kv_self.buf.size == kv_size); const size_t elt_size = ggml_element_size(kv_self.k); @@ -3976,7 +6872,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const size_t nread = inp - src; const size_t max_size = llama_get_state_size(ctx); - LLAMA_ASSERT(nread <= max_size); + GGML_ASSERT(nread <= max_size); return nread; } @@ -3990,7 +6886,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c const uint32_t version = file.read_u32(); if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { - fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); + LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); return false; } @@ -3998,7 +6894,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c file.read_raw(&session_hparams, sizeof(llama_hparams)); if (session_hparams != ctx->model.hparams) { - fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__); + LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__); return false; } } @@ -4008,7 +6904,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c const uint32_t n_token_count = file.read_u32(); if (n_token_count > n_token_capacity) { - fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); return false; } @@ -4022,7 +6918,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c const size_t n_state_size_max = llama_get_state_size(ctx); if (n_state_size_cur > n_state_size_max) { - fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); + LLAMA_LOG_ERROR("%s : the state size in session file is too big! 
max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); return false; } @@ -4039,7 +6935,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi try { return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { - fprintf(stderr, "error loading session file: %s\n", err.what()); + LLAMA_LOG_ERROR("error loading session file: %s\n", err.what()); return false; } } @@ -4070,7 +6966,7 @@ int llama_eval( int n_past, int n_threads) { if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) { - fprintf(stderr, "%s: failed to eval\n", __func__); + LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; } @@ -4084,7 +6980,6 @@ int llama_eval( return 0; } - int llama_eval_embd( struct llama_context * ctx, const float * embd, @@ -4092,7 +6987,7 @@ int llama_eval_embd( int n_past, int n_threads) { if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) { - fprintf(stderr, "%s: failed to eval\n", __func__); + LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; } @@ -4110,120 +7005,115 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) { const int n_batch = 1; const int n_ctx = 512 - n_batch; - const std::vector tmp(n_batch, llama_token_bos()); + const std::vector tmp(n_batch, llama_token_bos(ctx)); if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) { - fprintf(stderr, "%s: failed to eval\n", __func__); + LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; } return 0; } -int llama_tokenize_with_model( - const struct llama_model * model, - const char * text, - llama_token * tokens, - int n_max_tokens, - bool add_bos) { - auto res = llama_tokenize(model->vocab, text, add_bos); - - if (n_max_tokens < (int) res.size()) { - fprintf(stderr, "%s: too many tokens\n", __func__); - return -((int) res.size()); - } - - for (size_t i = 0; i < res.size(); i++) { - tokens[i] = res[i]; - } - - return res.size(); -} - -int llama_tokenize( - struct llama_context * ctx, - const char * text, - llama_token * tokens, - int n_max_tokens, - bool add_bos) { - return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos); -} - -int llama_n_vocab_from_model(const struct llama_model * model) { - return model->vocab.id_to_token.size(); +float * llama_get_logits(struct llama_context * ctx) { + return ctx->logits.data(); } -int llama_n_ctx_from_model(const struct llama_model * model) { - return model->hparams.n_ctx; +float * llama_get_embeddings(struct llama_context * ctx) { + return ctx->embedding.data(); } -int llama_n_embd_from_model(const struct llama_model * model) { - return model->hparams.n_embd; +const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) { + return ctx->model.vocab.id_to_token[token].text.c_str(); } -int llama_n_vocab(const struct llama_context * ctx) { - return ctx->model.vocab.id_to_token.size(); +float llama_token_get_score(const struct llama_context * ctx, llama_token token) { + return ctx->model.vocab.id_to_token[token].score; } -int llama_n_ctx(const struct llama_context * ctx) { - return ctx->model.hparams.n_ctx; +llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) { + return ctx->model.vocab.id_to_token[token].type; } -int llama_n_embd(const struct llama_context * ctx) { - return ctx->model.hparams.n_embd; +llama_token llama_token_bos(const struct 
llama_context * ctx) { + return ctx->model.vocab.special_bos_id; } -int llama_get_vocab_from_model( - const struct llama_model * model, - const char * * strings, - float * scores, - int capacity) { - int n = std::min(capacity, (int) model->vocab.id_to_token.size()); - for (int i = 0; ivocab.id_to_token[i].tok.c_str(); - scores[i] = model->vocab.id_to_token[i].score; - } - return n; +llama_token llama_token_eos(const struct llama_context * ctx) { + return ctx->model.vocab.special_eos_id; } -int llama_get_vocab( - const struct llama_context * ctx, - const char * * strings, - float * scores, - int capacity) { - return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity); +llama_token llama_token_nl(const struct llama_context * ctx) { + return ctx->model.vocab.linefeed_id; } -float * llama_get_logits(struct llama_context * ctx) { - return ctx->logits.data(); +int llama_tokenize( + struct llama_context * ctx, + const char * text, + int text_len, + llama_token * tokens, + int n_max_tokens, + bool add_bos) { + return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos); } -float * llama_get_embeddings(struct llama_context * ctx) { - return ctx->embedding.data(); -} +int llama_tokenize_with_model( + const struct llama_model * model, + const char * text, + int text_len, + llama_token * tokens, + int n_max_tokens, + bool add_bos) { + auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos); -const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) { - if (token >= llama_n_vocab_from_model(model)) { - return nullptr; + if (n_max_tokens < (int) res.size()) { + // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); + return -((int) res.size()); } - return model->vocab.id_to_token[token].tok.c_str(); -} - -const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { - return llama_token_to_str_with_model(&ctx->model, token); -} + for (size_t i = 0; i < res.size(); i++) { + tokens[i] = res[i]; + } -llama_token llama_token_bos() { - return 1; + return res.size(); } -llama_token llama_token_eos() { - return 2; +int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) { + return llama_token_to_piece_with_model(&ctx->model, token, buf, length); } -llama_token llama_token_nl() { - return 13; +// does not write null-terminator to buf +int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { + if (0 <= token && token < llama_model_n_vocab(model)) { + if (llama_is_normal_token(model->vocab, token)) { + std::string result = model->vocab.id_to_token[token].text; + if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) { + llama_unescape_whitespace(result); + } + if (length < (int) result.length()) { + return -result.length(); + } + memcpy(buf, result.c_str(), result.length()); + return result.length(); + } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT + if (length < 3) { + return -3; + } + buf[0] = '\xe2'; + buf[1] = '\x96'; + buf[2] = '\x85'; + return 3; + } else if (llama_is_control_token(model->vocab, token)) { + ; + } else if (llama_is_byte_token(model->vocab, token)) { + if (length < 1) { + return -1; + } + buf[0] = llama_token_to_byte(model->vocab, token); + return 1; + } + } + return 0; } struct llama_timings llama_get_timings(struct llama_context * ctx) { @@ -4246,15 +7136,15 @@ struct llama_timings llama_get_timings(struct llama_context 
* ctx) { void llama_print_timings(struct llama_context * ctx) { const llama_timings timings = llama_get_timings(ctx); - fprintf(stderr, "\n"); - fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms); - fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + LLAMA_LOG_INFO("\n"); + LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms); + LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample); - fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); - fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); - fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); + LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); } void llama_reset_timings(struct llama_context * ctx) { @@ -4281,12 +7171,79 @@ const char * llama_print_system_info(void) { s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; + s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; return s.c_str(); } +void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) { + fprintf(stream, "\n"); + fprintf(stream, "###########\n"); + fprintf(stream, "# Timings #\n"); + fprintf(stream, "###########\n"); + fprintf(stream, "\n"); + + fprintf(stream, "mst_eval: %.2f # ms / token during generation\n", + 1.0e-3 * ctx->t_eval_us / ctx->n_eval); + fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n", + 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval); + fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n", + 1.0e-3 * ctx->t_sample_us / ctx->n_sample); + fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval); + fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval); + fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample); + fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us); + fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us); + fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us); + fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->t_sample_us); + fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n", + 1.0e6 * ctx->n_eval / ctx->t_eval_us); + fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n", + 1.0e6 * 
ctx->n_p_eval / ctx->t_p_eval_us); + fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n", + 1.0e6 * ctx->n_sample / ctx->t_sample_us); +} + // For internal test use -const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) { +const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map( + struct llama_context * ctx +) { return ctx->model.tensors_by_name; } + +void llama_log_set(llama_log_callback log_callback, void * user_data) { + g_state.log_callback = log_callback ? log_callback : llama_log_callback_default; + g_state.log_callback_user_data = user_data; +} + +static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) { + va_list args_copy; + va_copy(args_copy, args); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + g_state.log_callback(level, buffer, g_state.log_callback_user_data); + } else { + char* buffer2 = new char[len+1]; + vsnprintf(buffer2, len+1, format, args_copy); + buffer2[len] = 0; + g_state.log_callback(level, buffer2, g_state.log_callback_user_data); + delete[] buffer2; + } + va_end(args_copy); +} + +static void llama_log_internal(llama_log_level level, const char * format, ...) { + va_list args; + va_start(args, format); + llama_log_internal_v(level, format, args); + va_end(args); +} + +static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} diff --git a/llama.h b/llama.h index fa1977f2d949241834f3936424174c2133ef3ac0..369be048c001276bcf8d3cea6ca0c5dcebd79a2c 100644 --- a/llama.h +++ b/llama.h @@ -10,6 +10,7 @@ #endif // GGML_USE_CUBLAS #include <stddef.h> #include <stdint.h> +#include <stdio.h> #include <stdbool.h> #ifdef LLAMA_SHARED @@ -34,29 +35,18 @@ # define DEPRECATED(func, hint) func #endif -#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' -#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' -#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' -#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' -#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +#define LLAMA_DEFAULT_SEED 0xFFFFFFFF -#define LLAMA_FILE_VERSION 3 -#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT -#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML -#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 1 +#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' -#define LLAMA_DEFAULT_SEED 0xFFFFFFFF +#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN +#define LLAMA_SESSION_VERSION 1 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
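As a usage illustration for the llama_log_set hook implemented above (a minimal sketch, not part of the patch; the file-handle plumbing and main() wiring are assumptions for demonstration):

#include <cstdio>
#include "llama.h"

// Hypothetical callback: route all llama.cpp log output to a file passed via user_data.
static void log_to_file(enum llama_log_level level, const char * text, void * user_data) {
    (void) level;                           // severity is available but unused here
    std::fputs(text, (FILE *) user_data);   // per the header comment, text usually already ends with '\n'
    std::fflush((FILE *) user_data);
}

int main() {
    FILE * f = std::fopen("llama.log", "w");  // illustrative destination
    llama_log_set(log_to_file, f);            // passing NULL restores the default stderr sink
    // ... load a model and run inference; internal logs now land in llama.log ...
    llama_log_set(NULL, NULL);                // detach before closing the file
    std::fclose(f);
    return 0;
}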
#define LLAMA_SUPPORTS_GPU_OFFLOAD #endif -#ifndef LLAMA_DEFAULT_RMS_EPS -#define LLAMA_DEFAULT_RMS_EPS 5e-6f -#endif - #ifdef __cplusplus extern "C" { #endif @@ -72,6 +62,52 @@ extern "C" { typedef int llama_token; + enum llama_log_level { + LLAMA_LOG_LEVEL_ERROR = 2, + LLAMA_LOG_LEVEL_WARN = 3, + LLAMA_LOG_LEVEL_INFO = 4 + }; + + enum llama_vocab_type { + LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece + LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding + }; + + enum llama_token_type { + LLAMA_TOKEN_TYPE_UNDEFINED = 0, + LLAMA_TOKEN_TYPE_NORMAL = 1, + LLAMA_TOKEN_TYPE_UNKNOWN = 2, + LLAMA_TOKEN_TYPE_CONTROL = 3, + LLAMA_TOKEN_TYPE_USER_DEFINED = 4, + LLAMA_TOKEN_TYPE_UNUSED = 5, + LLAMA_TOKEN_TYPE_BYTE = 6, + }; + + // model file types + enum llama_ftype { + LLAMA_FTYPE_ALL_F32 = 0, + LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed + // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed + LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors + LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors + + LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file + }; + typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -86,12 +122,10 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); - struct llama_context_params { + struct llama_context_params { uint32_t seed; // RNG seed, -1 for random int32_t n_ctx; // text context int32_t n_batch; // prompt processing batch size - int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams) - float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams) int32_t n_gpu_layers; // number of layers to store in VRAM int32_t main_gpu; // the GPU that is used for scratch and small tensors @@ -116,35 +150,21 @@ extern "C" { bool use_mlock; // force system to keep model in RAM bool embedding; // embedding mode only }; - // model file types - enum llama_ftype { - LLAMA_FTYPE_ALL_F32 = 0, - LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed - // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed - LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors - 
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors - }; + + // Signature for logging events + // Note that text includes the new line character at the end for most events. + // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it + // if it exists. + // It might not exist for progress report where '.' is output repeatedly. + typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data); // model quantization parameters typedef struct llama_model_quantize_params { int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - enum llama_ftype ftype; // quantize to this llama_ftype + enum llama_ftype ftype; // quantize to this llama_ftype bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored } llama_model_quantize_params; // grammar types @@ -195,23 +215,16 @@ extern "C" { int32_t n_eval; }; - LLAMA_API int llama_max_devices(); - - LLAMA_API struct llama_context_params llama_context_default_params(); - LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); + LLAMA_API struct llama_context_params llama_context_default_params(void); + LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); - LLAMA_API bool llama_mmap_supported(); - LLAMA_API bool llama_mlock_supported(); - - // TODO: not great API - very likely to change // Initialize the llama + ggml backend // If numa is true, use NUMA optimizations // Call once at the start of the program LLAMA_API void llama_backend_init(bool numa); - // Call once at the end of the program - currently only used for MPI - LLAMA_API void llama_backend_free(); - LLAMA_API int64_t llama_time_us(); + // Call once at the end of the program - currently only used for MPI + LLAMA_API void llama_backend_free(void); LLAMA_API struct llama_model * llama_load_model_from_file( const char * path_model, @@ -223,17 +236,34 @@ extern "C" { struct llama_model * model, struct llama_context_params params); - // Various functions for loading a ggml llama model. - // Allocate (almost) all memory needed for the model. 
- // Return NULL on failure - LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file( - const char * path_model, - struct llama_context_params params), - "please use llama_load_model_from_file combined with llama_new_context_with_model instead"); - // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); + LLAMA_API int64_t llama_time_us(void); + + LLAMA_API int llama_max_devices (void); + LLAMA_API bool llama_mmap_supported (void); + LLAMA_API bool llama_mlock_supported(void); + + LLAMA_API int llama_n_vocab (const struct llama_context * ctx); + LLAMA_API int llama_n_ctx (const struct llama_context * ctx); + LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx); + LLAMA_API int llama_n_embd (const struct llama_context * ctx); + + LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx); + + LLAMA_API int llama_model_n_vocab (const struct llama_model * model); + LLAMA_API int llama_model_n_ctx (const struct llama_model * model); + LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model); + LLAMA_API int llama_model_n_embd (const struct llama_model * model); + + // Get a string describing the model type + LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); + // Returns the total size of all the tensors in the model in bytes + LLAMA_API uint64_t llama_model_size(const struct llama_model * model); + // Returns the total number of parameters in the model + LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model); + // Returns 0 on success LLAMA_API int llama_model_quantize( const char * fname_inp, @@ -255,9 +285,9 @@ extern "C" { LLAMA_API int llama_model_apply_lora_from_file( const struct llama_model * model, - const char * path_lora, - const char * path_base_model, - int n_threads); + const char * path_lora, + const char * path_base_model, + int n_threads); // Returns the number of tokens in the KV cache LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); @@ -307,14 +337,44 @@ extern "C" { // IMPORTANT: do not use for anything else other than debugging and testing! LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname); + // Token logits obtained from the last call to llama_eval() + // The logits for the last token are stored in the last row + // Can be mutated in order to change the probabilities of the next token + // Rows: n_tokens + // Cols: n_vocab + LLAMA_API float * llama_get_logits(struct llama_context * ctx); + + // Get the embeddings for the input + // shape: [n_embd] (1-dimensional) + LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); + + // + // Vocab + // + + LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token); + + LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token); + + LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token); + + // Special tokens + LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence + LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence + LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line + + // + // Tokenization + // + // Convert the provided text into tokens. // The tokens pointer must be large enough to hold the resulting tokens. 
// Returns the number of tokens on success, no more than n_max_tokens // Returns a negative number on failure - the number of tokens that would have been returned - // TODO: not sure if correct LLAMA_API int llama_tokenize( struct llama_context * ctx, const char * text, + int text_len, llama_token * tokens, int n_max_tokens, bool add_bos); @@ -322,59 +382,31 @@ extern "C" { LLAMA_API int llama_tokenize_with_model( const struct llama_model * model, const char * text, + int text_len, llama_token * tokens, int n_max_tokens, bool add_bos); - LLAMA_API int llama_n_vocab(const struct llama_context * ctx); - LLAMA_API int llama_n_ctx (const struct llama_context * ctx); - LLAMA_API int llama_n_embd (const struct llama_context * ctx); - - LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); - LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); - LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); - - // Get the vocabulary as output parameters. - // Returns number of results. - LLAMA_API int llama_get_vocab( - const struct llama_context * ctx, - const char * * strings, - float * scores, - int capacity); - - LLAMA_API int llama_get_vocab_from_model( - const struct llama_model * model, - const char * * strings, - float * scores, - int capacity); - - // Token logits obtained from the last call to llama_eval() - // The logits for the last token are stored in the last row - // Can be mutated in order to change the probabilities of the next token - // Rows: n_tokens - // Cols: n_vocab - LLAMA_API float * llama_get_logits(struct llama_context * ctx); - - // Get the embeddings for the input - // shape: [n_embd] (1-dimensional) - LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); - - // Token Id -> String. Uses the vocabulary in the provided context - LLAMA_API const char * llama_token_to_str( + // Token Id -> Piece. + // Uses the vocabulary in the provided context. + // Does not write null terminator to the buffer. + // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. + LLAMA_API int llama_token_to_piece( const struct llama_context * ctx, - llama_token token); + llama_token token, + char * buf, + int length); - LLAMA_API const char * llama_token_to_str_with_model( + LLAMA_API int llama_token_to_piece_with_model( const struct llama_model * model, - llama_token token); - - // Special tokens - LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence - LLAMA_API llama_token llama_token_eos(); // end-of-sentence - LLAMA_API llama_token llama_token_nl(); // next-line + llama_token token, + char * buf, + int length); + // // Grammar // + LLAMA_API struct llama_grammar * llama_grammar_init( const llama_grammar_element ** rules, size_t n_rules, @@ -382,7 +414,11 @@ extern "C" { LLAMA_API void llama_grammar_free(struct llama_grammar * grammar); + LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar); + + // // Sampling functions + // /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. 
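Taken together, the reworked tokenization entry points above can be driven like this (a minimal sketch, assuming a llama_context * ctx created elsewhere via llama_load_model_from_file plus llama_new_context_with_model; the negative-return handling follows the comments above):

#include <cstdio>
#include <string>
#include <vector>
#include "llama.h"

// Sketch: tokenize a prompt, growing the buffer on demand, then print each token's piece.
static void dump_tokens(struct llama_context * ctx, const std::string & prompt) {
    std::vector<llama_token> tokens(prompt.size() + 1);   // rough initial guess, +1 for BOS
    int n = llama_tokenize(ctx, prompt.c_str(), (int) prompt.size(),
                           tokens.data(), (int) tokens.size(), /*add_bos=*/true);
    if (n < 0) {                                          // negative = required token count
        tokens.resize(-n);
        n = llama_tokenize(ctx, prompt.c_str(), (int) prompt.size(),
                           tokens.data(), (int) tokens.size(), true);
    }
    for (int i = 0; i < n; i++) {
        char buf[64];
        int len = llama_token_to_piece(ctx, tokens[i], buf, (int) sizeof(buf));
        if (len >= 0) {
            std::printf("%d -> '%.*s'\n", tokens[i], len, buf);  // piece is not null-terminated
        }
    }
}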
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); @@ -443,6 +479,43 @@ extern "C" { /// @details Accepts the sampled token into the grammar LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token); + // + // Beam search + // + + struct llama_beam_view { + const llama_token * tokens; + size_t n_tokens; + float p; // Cumulative beam probability (renormalized relative to all beams) + bool eob; // Callback should set this to true when a beam is at end-of-beam. + }; + + // Passed to beam_search_callback function. + // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams + // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks. + // These pointers are valid only during the synchronous callback, so should not be saved. + struct llama_beams_state { + struct llama_beam_view * beam_views; + size_t n_beams; // Number of elements in beam_views[]. + size_t common_prefix_length; // Current max length of prefix tokens shared by all beams. + bool last_call; // True iff this is the last callback invocation. + }; + + // Type of pointer to the beam_search_callback function. + // void* callback_data is any custom data passed to llama_beam_search, that is subsequently + // passed back to beam_search_callback. This avoids having to use global variables in the callback. + typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state); + + /// @details Deterministically returns entire sentence constructed by a beam search. + /// @param ctx Pointer to the llama_context. + /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state. + /// @param callback_data A pointer that is simply passed back to callback. + /// @param n_beams Number of beams to use. + /// @param n_past Number of tokens already evaluated. + /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier. + /// @param n_threads Number of threads as passed to llama_eval(). + LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads); + // Performance information LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); LLAMA_API void llama_print_timings(struct llama_context * ctx); @@ -451,6 +524,12 @@ extern "C" { // Print system information LLAMA_API const char * llama_print_system_info(void); + // Set callback for all future logging events. + // If this is not called, or NULL is supplied, everything is output on stderr. 
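To make the beam-search callback contract above concrete, here is a skeletal callback (a sketch only; the beam_sink accumulator and the end-of-sentence check are illustrative assumptions, not part of the patch):

#include <vector>
#include "llama.h"

// Collects finalized tokens from the common prefix and marks finished beams.
struct beam_sink {
    struct llama_context * ctx;
    std::vector<llama_token> response;   // tokens accepted so far
};

static void on_beams(void * callback_data, struct llama_beams_state state) {
    beam_sink & sink = *(beam_sink *) callback_data;
    // Tokens shared by all beams are final; save them before they are shifted out.
    if (state.common_prefix_length > 0) {
        const struct llama_beam_view & b = state.beam_views[0];
        sink.response.insert(sink.response.end(),
                             b.tokens, b.tokens + state.common_prefix_length);
    }
    // Mark any beam whose last token is end-of-sentence as done.
    for (size_t i = 0; i < state.n_beams; i++) {
        struct llama_beam_view & bv = state.beam_views[i];
        if (!bv.eob && bv.n_tokens > 0 && bv.tokens[bv.n_tokens - 1] == llama_token_eos(sink.ctx)) {
            bv.eob = true;
        }
    }
}

// Usage sketch: beam_sink sink{ctx, {}};
// llama_beam_search(ctx, on_beams, &sink, /*n_beams=*/4, n_past, /*n_predict=*/128, n_threads);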
+ LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data); + + LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); + #ifdef __cplusplus } #endif @@ -460,10 +539,13 @@ extern "C" { #include <vector> #include <string> + struct ggml_tensor; -const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx); +const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map( + struct llama_context * ctx +); -#endif +#endif // LLAMA_API_INTERNAL #endif // LLAMA_H diff --git a/make_old_pyinstaller.bat b/make_old_pyinstaller.bat index c72a2ce10fa4267cd49124b6f5333a53065bd29f..6ae4297bcf77a90986a22351ecf64de00a5c1abd 100644 --- a/make_old_pyinstaller.bat +++ b/make_old_pyinstaller.bat @@ -1,4 +1,4 @@ echo This file is only for my own usage, please do not use it. I am lazy. set PATH=d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python;d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python\\Scripts;%PATH% -PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp_nocuda.exe" \ No newline at end of file +PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp_default.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp_nocuda.exe" \ No newline at end of file diff --git a/make_old_pyinstaller_cuda.bat b/make_old_pyinstaller_cuda.bat index dfe28457161498e37a7c13d8a399d15fda71cf54..518fb165c192db09517750335c6b2191b4e910f3 100644 --- a/make_old_pyinstaller_cuda.bat +++ b/make_old_pyinstaller_cuda.bat @@ -1,4 +1,4 @@ echo This file is only for my own usage, please do not use it. I am lazy. set PATH=d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python;d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python\\Scripts;%PATH% -PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./nikogreen.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cublas64_11.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cudart64_110.dll;." --add-data "./msvcp140.dll;." --add-data "./vcruntime140.dll;." --add-data "./vcruntime140_1.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe" \ No newline at end of file +PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./nikogreen.ico" --add-data "./klite.embd;." --add-data "./koboldcpp_default.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;."
--add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cublas64_11.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cudart64_110.dll;." --add-data "./msvcp140.dll;." --add-data "./vcruntime140.dll;." --add-data "./vcruntime140_1.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe" \ No newline at end of file diff --git a/make_pyinstaller.bat b/make_pyinstaller.bat index 3cf867b28c3feed10f1731c1ba6b5cda2a64284e..5ca9604192c2a958a0ea1b102c3a64351b842147 100644 --- a/make_pyinstaller.bat +++ b/make_pyinstaller.bat @@ -1 +1 @@ -PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe" \ No newline at end of file +PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp_default.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe" \ No newline at end of file diff --git a/make_pyinstaller.sh b/make_pyinstaller.sh index 270850663ee79a3f061823a4a43214fbae40b6d2..aeffb0fee2256b159079980a9c46da4773efae22 100644 --- a/make_pyinstaller.sh +++ b/make_pyinstaller.sh @@ -2,7 +2,7 @@ pyinstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" \ --add-data "./klite.embd:." \ ---add-data "./koboldcpp.so:." \ +--add-data "./koboldcpp_default.so:." \ --add-data "./koboldcpp_openblas.so:." \ --add-data "./koboldcpp_failsafe.so:." \ --add-data "./koboldcpp_noavx2.so:." \ diff --git a/make_pyinstaller_hybrid_henk.bat b/make_pyinstaller_hybrid_henk.bat index a99b1569dcdd7f952b8f3dc20fe1f19379e3602e..fa5a8e232a9e731f4b9413f990b255cdb6705c73 100644 --- a/make_pyinstaller_hybrid_henk.bat +++ b/make_pyinstaller_hybrid_henk.bat @@ -2,4 +2,4 @@ cd /d "%~dp0" copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cudart64_110.dll" .\ /Y copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cublasLt64_11.dll" .\ /Y copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cublas64_11.dll" .\ /Y -PyInstaller --noconfirm --onefile --collect-all customtkinter --clean --console --icon ".\niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cudart64_110.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cublas64_11.dll;." --add-data "./rwkv_vocab.embd;." --add-data "C:/Windows/System32/msvcp140.dll;." --add-data "C:/Windows/System32/vcruntime140_1.dll;." 
"./koboldcpp.py" -n "koboldcpp.exe" \ No newline at end of file +PyInstaller --noconfirm --onefile --collect-all customtkinter --clean --console --icon ".\niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp_default.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cudart64_110.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cublas64_11.dll;." --add-data "./rwkv_vocab.embd;." --add-data "C:/Windows/System32/msvcp140.dll;." --add-data "C:/Windows/System32/vcruntime140_1.dll;." "./koboldcpp.py" -n "koboldcpp.exe" \ No newline at end of file diff --git a/model_adapter.cpp b/model_adapter.cpp index 514764ec0f52492ae5b7b72bc53bd97c0f98d71b..5c3a3f95fb14e56230eda30d760e11ad78f47e5e 100644 --- a/model_adapter.cpp +++ b/model_adapter.cpp @@ -10,6 +10,7 @@ #include #include "model_adapter.h" +#include "ggml.h" #include @@ -79,7 +80,7 @@ void print_tok_vec(std::vector &embd) } //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt) - FileFormat check_file_format(const std::string & fname) + FileFormat check_file_format(const std::string & fname, FileFormatExtraMeta * fileformatmeta) { std::vector f_buf(1024*1024); @@ -251,7 +252,51 @@ void print_tok_vec(std::vector &embd) fileformat = FileFormat::GGJT_2; } } - fin.close(); + else if(magic == 0x46554747) + { + fin.close(); + fileformat = FileFormat::GGUF_LLAMA; + + struct gguf_init_params ggufparams; + ggufparams.no_alloc = true; + ggufparams.ctx = NULL; + + auto ctx = gguf_init_from_file(fname.c_str(), ggufparams); + + auto keyidx = gguf_find_key(ctx, "general.architecture"); + std::string modelarch = ""; + if (keyidx != -1) { modelarch = gguf_get_val_str(ctx, keyidx); } + + if(modelarch=="llama") + { + fileformat = FileFormat::GGUF_LLAMA; + } + else if(modelarch=="falcon") + { + fileformat = FileFormat::GGUF_FALCON; //uses the same loader + printf("\nDetected GGUF FALCON format.\n"); + } + else + { + printf("\nERROR: Detected unimplemented GGUF Arch: %s\n",modelarch.c_str()); + } + + if(modelarch!="" && fileformatmeta!=nullptr) + { + std::string fkey = modelarch+".context_length"; + auto keyidx = gguf_find_key(ctx, fkey.c_str()); + if (keyidx != -1) { + fileformatmeta->n_ctx_train = gguf_get_val_u32(ctx, keyidx); + } + } + gguf_free(ctx); + } + + if(fin.is_open()) + { + fin.close(); + } + return fileformat; } diff --git a/model_adapter.h b/model_adapter.h index 2b43f7e6fa3cd2e03da184cc95cbda8ed9d96ee7..fa596700a8df1188c5aa6b69b7da9ac1c1595fb2 100644 --- a/model_adapter.h +++ b/model_adapter.h @@ -21,6 +21,7 @@ enum FileFormat GGJT=3, // 3=(llama ggjt) GGJT_2=4, //newer llama format unshuffled GGJT_3=5, //using 16bit scalar + GGUF_LLAMA=6, //GGUF (llama newest ver) GPTJ_1=100, //the very first super old GPTJ format GPTJ_2=101, //pygmalion, uses old ggml lib @@ -45,6 +46,14 @@ enum FileFormat NEOX_7=406, //using 16bit scalar redpajama MPT_1=500, //first supported mpt version + + GGUF_FALCON=600, //GGUF (falcon) + +}; + +struct FileFormatExtraMeta +{ + int n_ctx_train = 2048; }; enum ModelLoadResult @@ -54,10 +63,11 @@ enum ModelLoadResult RETRY_LOAD = 2, //used if it's suspected that the model is an older format }; -ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format); +ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat 
in_file_format, FileFormatExtraMeta file_format_meta); generation_outputs gpttype_generate(const generation_inputs inputs, generation_outputs &output); bool gpttype_generate_abort(); const std::string & gpttype_get_pending_output(); +int gpttype_token_count(const std::string & input); void timer_start(); double timer_check(); @@ -68,7 +78,7 @@ std::vector<int> LongestCommonSubseq(const std::vector<int> x, const std::vector<int> bool ArrStartWith(const std::vector<int> targetArray, const std::vector<int> searchSeq); int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchSeq); -FileFormat check_file_format(const std::string & fname); +FileFormat check_file_format(const std::string & fname, FileFormatExtraMeta * fileformatmeta); void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp, int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, const bool useSmartContext, const bool requireFullSubset); \ No newline at end of file diff --git a/models/.editorconfig b/models/.editorconfig new file mode 100644 index 0000000000000000000000000000000000000000..78b36ca0838fc0e4747c102a9b16154d7d481c2b --- /dev/null +++ b/models/.editorconfig @@ -0,0 +1 @@ +root = true diff --git a/models/ggml-vocab-llama.gguf b/models/ggml-vocab-llama.gguf new file mode 100644 index 0000000000000000000000000000000000000000..63bfaf672f382c0f5bbcffe54736e2698ef3ac55 Binary files /dev/null and b/models/ggml-vocab-llama.gguf differ diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000000000000000000000000000000000000..55c168f2d7d1272f5c9e807deb10957e5051796b --- /dev/null +++ b/mypy.ini @@ -0,0 +1,5 @@ +[mypy] +strict = true +allow_untyped_calls = true +allow_untyped_defs = true +allow_incomplete_defs = true diff --git a/otherarch/ggml_v1.c b/otherarch/ggml_v1.c index 1c96d6dceb394b30383e8218231dc79c29c4060e..efb591e77d2dc6a70190aec425ad9b91d8a92c06 100644 --- a/otherarch/ggml_v1.c +++ b/otherarch/ggml_v1.c @@ -1,6 +1,3 @@ -// Defines CLOCK_MONOTONIC and asprintf on Linux -#define _GNU_SOURCE - #include "ggml_v1.h" #if defined(_MSC_VER) || defined(__MINGW32__) diff --git a/otherarch/ggml_v2-cuda-legacy.cu b/otherarch/ggml_v2-cuda-legacy.cu index d3220a786488ea7ef02163ea55576bd2b9d9c47d..e7053b764b670105be7033a8f467284775f242fc 100644 --- a/otherarch/ggml_v2-cuda-legacy.cu +++ b/otherarch/ggml_v2-cuda-legacy.cu @@ -4,9 +4,64 @@ #include #include +#if defined(GGML_USE_HIPBLAS) +#include <hip/hip_runtime.h> +#include <hipblas/hipblas.h> +#include <hip/hip_fp16.h> +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t +#define cudaFree hipFree
+#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMemcpy hipMemcpy +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#else #include <cuda_runtime.h> #include <cublas_v2.h> #include <cuda_fp16.h> +#endif #include "ggml_v2-cuda-legacy.h" +#include "ggml_v2-cuda.h" diff --git a/otherarch/ggml_v2-cuda.cu b/otherarch/ggml_v2-cuda.cu index 8314adb25b49095a2ad3e8f8a4043c4bc2f1eddb..b4502df00c637041a593d63074ffea264f028544 100644 --- a/otherarch/ggml_v2-cuda.cu +++ b/otherarch/ggml_v2-cuda.cu @@ -4,10 +4,66 @@ #include #include +#if defined(GGML_USE_HIPBLAS) +#include <hip/hip_runtime.h> +#include <hipblas/hipblas.h> +#include <hip/hip_fp16.h> +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMemcpy hipMemcpy +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamNonBlocking
hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#else #include <cuda_runtime.h> #include <cublas_v2.h> #include <cuda_fp16.h> +#endif + #include "ggml_v2-cuda.h" #include "ggml_v2.h" @@ -807,4 +863,4 @@ void ggml_v2_cuda_transform_tensor(ggml_v2_tensor * tensor) { tensor->data = d_Q; tensor->backend = GGML_V2_BACKEND_CUDA; -} \ No newline at end of file +} diff --git a/otherarch/ggml_v2-opencl.cpp b/otherarch/ggml_v2-opencl.cpp index 93f038ef54c600f20a0900a4b2f11d642349d86b..660d93baa16fe2fec427fe392a1b2398c9947d07 100644 --- a/otherarch/ggml_v2-opencl.cpp +++ b/otherarch/ggml_v2-opencl.cpp @@ -573,7 +573,7 @@ static void ggml_v2_cl_mul_mat_f32(const ggml_v2_tensor * src0, const ggml_v2_te &queue, &ev_sgemm); if (status != clblast::StatusCode::kSuccess) { - printf("\nF32 Matmul Failed (%d): [dims: %lld,%lld,%lld,%lld] You may be out of VRAM. Please check if you have enough.\n",status,ne00,ne01,ne10,ne11); + printf("\nF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast<int>(status),ne00,ne01,ne10,ne11); GGML_V2_ASSERT(false); } @@ -672,7 +672,7 @@ static void ggml_v2_cl_mul_mat_f16(const ggml_v2_tensor * src0, const ggml_v2_te &queue, &ev_sgemm); if (status != clblast::StatusCode::kSuccess) { - printf("\nF16 Matmul Failed (%d): [dims: %lld,%lld,%lld,%lld] You may be out of VRAM. Please check if you have enough.\n",status,ne00,ne01,ne10,ne11); + printf("\nF16 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast<int>(status),ne00,ne01,ne10,ne11); GGML_V2_ASSERT(false); } @@ -780,7 +780,7 @@ static void ggml_v2_cl_mul_mat_q_f32(const ggml_v2_tensor * src0, const ggml_v2_ &queue, &ev_sgemm); if (status != clblast::StatusCode::kSuccess) { - printf("\nQF32 Matmul Failed (%d): [dims: %lld,%lld,%lld,%lld] You may be out of VRAM. Please check if you have enough.\n",status,ne00,ne01,ne10,ne11); + printf("\nQF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM.
Please check if you have enough.\n",static_cast<int>(status),ne00,ne01,ne10,ne11); GGML_V2_ASSERT(false); } } diff --git a/otherarch/ggml_v2.c b/otherarch/ggml_v2.c index 6b18fe723a84681c2f9a1199666cb5a0db93fd1e..a3f593ee6fd2734ba81b39f20dc5b4c8b213fdec 100644 --- a/otherarch/ggml_v2.c +++ b/otherarch/ggml_v2.c @@ -1,6 +1,3 @@ -// Defines CLOCK_MONOTONIC on Linux -#define _GNU_SOURCE - #include "ggml_v2.h" #if defined(_MSC_VER) || defined(__MINGW32__) diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp index 9e023a9d6c1deab2e96aa346875fe63ffc6faeef..33ca85e11547aee09db61e8c6d75a8703a069b2d 100644 --- a/otherarch/gpt2_v2.cpp +++ b/otherarch/gpt2_v2.cpp @@ -150,7 +150,7 @@ ModelLoadResult gpt2_v2_model_load(const std::string & fname, gpt2_v2_model & mo params.mem_size = ctx_size; params.mem_buffer = NULL; params.no_alloc = false; - + model.ctx = ggml_v2_init(params); if (!model.ctx) { @@ -237,7 +237,7 @@ ModelLoadResult gpt2_v2_model_load(const std::string & fname, gpt2_v2_model & mo const int n_mem = n_layer*n_ctx; const int n_elements = n_embd*n_mem; - + model.memory_k = ggml_v2_new_tensor_1d(ctx, memory_type, n_elements*1.5); model.memory_v = ggml_v2_new_tensor_1d(ctx, memory_type, n_elements*1.5); @@ -287,7 +287,7 @@ ModelLoadResult gpt2_v2_model_load(const std::string & fname, gpt2_v2_model & mo } if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n", + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%ld, %ld], expected [%d, %d]\n", __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); return ModelLoadResult::FAIL; } @@ -379,7 +379,7 @@ bool gpt2_v2_eval( params.mem_size = buf_size; params.mem_buffer = buf; params.no_alloc = false; - + struct ggml_v2_context * ctx0 = ggml_v2_init(params); struct ggml_v2_cgraph gf = {}; diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index dd178ca021760390ad1c4a24307f202d35afdbc5..97b23265f434bd93ca73a3c2655238d52794c6ea 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -359,7 +359,11 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g const auto & hparams = model.hparams; size_t vram_total = 0; const int n_gpu = std::min(gpulayers, int(hparams.n_layer)); - fprintf(stderr, "%s: [GPU] offloading %d layers to GPU\n", __func__, n_gpu); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); + #else + fprintf(stderr, "%s: [CUDA] offloading %d layers to GPU\n", __func__, n_gpu); + #endif for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; layer.c_attn_attn_w->backend = GGML_BACKEND_GPU; @@ -378,7 +382,11 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); #endif } - fprintf(stderr, "%s: [GPU] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #else + fprintf(stderr, "%s: [CUDA] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #endif } #endif @@ -473,7 +481,7 @@ bool gpt2_eval( // norm { // [ 768, N] - cur = ggml_norm(ctx0, inpL); + cur = ggml_norm(ctx0, inpL, default_norm_eps); // cur = ln_1_g*cur + ln_1_b // [ 768, N] @@ -624,7 +632,7 @@ bool gpt2_eval( { // norm { -
cur = ggml_norm(ctx0, inpFF); + cur = ggml_norm(ctx0, inpFF, default_norm_eps); // cur = ln_2_g*cur + ln_2_b // [ 768, N] @@ -683,7 +691,7 @@ bool gpt2_eval( // norm { // [ 768, N] - inpL = ggml_norm(ctx0, inpL); + inpL = ggml_norm(ctx0, inpL, default_norm_eps); // inpL = ln_f_g*inpL + ln_f_b // [ 768, N] diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp index 100ca1ace5dc6f18ec50212248b21ecd81d6a8a6..97e885016e264038eb35d969aa666d17e2378e57 100644 --- a/otherarch/gptj_v2.cpp +++ b/otherarch/gptj_v2.cpp @@ -150,7 +150,7 @@ ModelLoadResult gptj_v2_model_load(const std::string & fname, gptj_v2_model & mo params.mem_size = ctx_size; params.mem_buffer = NULL; params.no_alloc = false; - + model.ctx = ggml_v2_init(params); if (!model.ctx) { @@ -281,7 +281,7 @@ ModelLoadResult gptj_v2_model_load(const std::string & fname, gptj_v2_model & mo fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); return ModelLoadResult::FAIL; } - + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { @@ -294,11 +294,11 @@ ModelLoadResult gptj_v2_model_load(const std::string & fname, gptj_v2_model & mo } else { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%ld, %ld], expected [%d, %d]\n", __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); return ModelLoadResult::FAIL; } - + } // for debugging @@ -387,7 +387,7 @@ bool gptj_v2_eval( params.mem_size = buf_size; params.mem_buffer = buf; params.no_alloc = false; - + struct ggml_v2_context * ctx0 = ggml_v2_init(params); struct ggml_v2_cgraph gf = {}; diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index 1cc566208f7809f31463a6104ed5370c1fe136bd..1001e9b842e7f6b32ab8409b79031b7f87020f97 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -304,7 +304,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g } else { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%ld, %ld], expected [%d, %d]\n", __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); return ModelLoadResult::FAIL; } @@ -348,7 +348,11 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g const auto & hparams = model.hparams; size_t vram_total = 0; const int n_gpu = std::min(gpulayers, int(hparams.n_layer)); - fprintf(stderr, "%s: [GPU] offloading %d layers to GPU\n", __func__, n_gpu); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); + #else + fprintf(stderr, "%s: [CUDA] offloading %d layers to GPU\n", __func__, n_gpu); + #endif for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; layer.c_attn_q_proj_w->backend = GGML_BACKEND_GPU; @@ -373,7 +377,11 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); #endif } - fprintf(stderr, "%s: [GPU] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #else + fprintf(stderr, "%s: [CUDA] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #endif } #endif @@ -464,7 +472,7 @@ bool 
gptj_eval( // norm { - cur = ggml_norm(ctx0, inpL); + cur = ggml_norm(ctx0, inpL, default_norm_eps); // cur = ln_1_g*cur + ln_1_b cur = ggml_add(ctx0, @@ -594,7 +602,7 @@ bool gptj_eval( // norm { - inpL = ggml_norm(ctx0, inpL); + inpL = ggml_norm(ctx0, inpL, default_norm_eps); // inpL = ln_f_g*inpL + ln_f_b inpL = ggml_add(ctx0, @@ -644,4 +652,4 @@ bool gptj_eval( ggml_free(ctx0); return true; -} \ No newline at end of file +} diff --git a/otherarch/llama-util.h b/otherarch/llama-util.h new file mode 100644 index 0000000000000000000000000000000000000000..e1986eb268564943486452bda4a2f3cabe441630 --- /dev/null +++ b/otherarch/llama-util.h @@ -0,0 +1,557 @@ +// Internal header to be included only by llama.cpp. +// Contains wrappers around OS interfaces. +#pragma once +#ifndef LLAMA_V3_UTIL_H +#define LLAMA_V3_UTIL_H + +#include <cstdio> +#include <cstdint> +#include <cerrno> +#include <cstring> +#include <cstdarg> +#include <cstdlib> +#include <climits> + +#include <string> +#include <vector> +#include <stdexcept> + +#ifdef __has_include + #if __has_include(<unistd.h>) + #include <unistd.h> + #if defined(_POSIX_MAPPED_FILES) + #include <sys/mman.h> + #endif + #if defined(_POSIX_MEMLOCK_RANGE) + #include <sys/resource.h> + #endif + #endif +#endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #ifndef NOMINMAX + #define NOMINMAX + #endif + #include <windows.h> + #include <io.h> + #include <stdio.h> // for _fseeki64 +#endif + +#define LLAMA_V3_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "LLAMA_V3_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + +#ifdef __GNUC__ +#ifdef __MINGW32__ +__attribute__((format_old(gnu_printf, 1, 2))) +#else +__attribute__((format_old(printf, 1, 2))) +#endif +#endif +static std::string format_old(const char * fmt, ...) { + va_list ap, ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + LLAMA_V3_ASSERT(size >= 0 && size < INT_MAX); + std::vector<char> buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + LLAMA_V3_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + +struct llama_v3_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_v3_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format_old("failed to open %s: %s", fname, strerror(errno))); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + LLAMA_V3_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + LLAMA_V3_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, len, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format_old("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector<char> chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, len, 1, fp); + if (ret != 1)
{ + throw std::runtime_error(format_old("write error: %s", strerror(errno))); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_v3_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +// llama_v3_context_data +struct llama_v3_data_context { + virtual void write(const void * src, size_t size) = 0; + virtual size_t get_size_written() = 0; + virtual ~llama_v3_data_context() = default; +}; + +struct llama_v3_data_buffer_context : llama_v3_data_context { + uint8_t* ptr; + size_t size_written = 0; + + llama_v3_data_buffer_context(uint8_t * p) : ptr(p) {} + + void write(const void * src, size_t size) override { + memcpy(ptr, src, size); + ptr += size; + size_written += size; + } + + size_t get_size_written() override { + return size_written; + } +}; + +struct llama_v3_data_file_context : llama_v3_data_context { + llama_v3_file* file; + size_t size_written = 0; + + llama_v3_data_file_context(llama_v3_file * f) : file(f) {} + + void write(const void * src, size_t size) override { + file->write_raw(src, size); + size_written += size; + } + + size_t get_size_written() override { + return size_written; + } +}; + +#if defined(_WIN32) +static std::string llama_v3_format_win_err(DWORD err) { + LPSTR buf; + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); + if (!size) { + return "FormatMessageA failed"; + } + std::string ret(buf, size); + LocalFree(buf); + return ret; +} +#endif + +struct llama_v3_mmap { + void * addr; + size_t size; + + llama_v3_mmap(const llama_v3_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + llama_v3_mmap(struct llama_v3_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (prefetch >= file->size) { flags |= MAP_POPULATE; } +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error(format_old("mmap failed: %s", strerror(errno))); + } + + if (prefetch > 0) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + if (numa) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + } + + ~llama_v3_mmap() { + munmap(addr, size); + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + llama_v3_mmap(struct llama_v3_file * file, bool prefetch = true, bool numa = false) { + (void) numa; + + size = file->size; + + HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); + + HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + DWORD error = GetLastError(); + + if (hMapping == NULL) { + throw std::runtime_error(format_old("CreateFileMappingA failed: %s", llama_v3_format_win_err(error).c_str())); + } + + addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + error = GetLastError(); + CloseHandle(hMapping); + + if (addr == NULL) { + throw 
std::runtime_error(format_old("MapViewOfFile failed: %s", llama_v3_format_win_err(error).c_str())); + } + + #ifndef USE_FAILSAFE + if (prefetch) { + // The PrefetchVirtualMemory API is only present on Windows 8 and above, so we + // will dynamically load it using GetProcAddress. + BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); + HMODULE hKernel32; + + // This call is guaranteed to succeed. + hKernel32 = GetModuleHandleW(L"kernel32.dll"); + + // This call may fail if on a pre-Win8 system. + pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory")); + + if (pPrefetchVirtualMemory) { + // Advise the kernel to preload the mapped memory. + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + llama_v3_format_win_err(GetLastError()).c_str()); + } + } + } + #else + printf("\nPrefetchVirtualMemory skipped in compatibility mode.\n"); + #endif + } + + ~llama_v3_mmap() { + if (!UnmapViewOfFile(addr)) { + fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", + llama_v3_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + llama_v3_mmap(struct llama_v3_file *, bool prefetch = true, bool numa = false) { + (void) prefetch; + (void) numa; + + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +// Represents some region of memory being locked using mlock or VirtualLock; +// will automatically unlock on destruction. +struct llama_v3_mlock { + void * addr = NULL; + size_t size = 0; + bool failed_already = false; + + llama_v3_mlock() {} + llama_v3_mlock(const llama_v3_mlock &) = delete; + + ~llama_v3_mlock() { + if (size) { + raw_unlock(addr, size); + } + } + + void init(void * ptr) { + LLAMA_V3_ASSERT(addr == NULL && size == 0); + addr = ptr; + } + + void grow_to(size_t target_size) { + LLAMA_V3_ASSERT(addr); + if (failed_already) { + return; + } + size_t granularity = lock_granularity(); + target_size = (target_size + granularity - 1) & ~(granularity - 1); + if (target_size > size) { + if (raw_lock((uint8_t *) addr + size, target_size - size)) { + size = target_size; + } else { + failed_already = true; + } + } + } + +#ifdef _POSIX_MEMLOCK_RANGE + static constexpr bool SUPPORTED = true; + + size_t lock_granularity() { + return (size_t) sysconf(_SC_PAGESIZE); + } + + #ifdef __APPLE__ + #define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n" + #else + #define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" + #endif + + bool raw_lock(const void * addr, size_t size) { + if (!mlock(addr, size)) { + return true; + } else { + char* errmsg = std::strerror(errno); + bool suggest = (errno == ENOMEM); + + // Check if the resource limit is fine after all + struct rlimit lock_limit; + if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) + suggest = false; + if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) + suggest = false; + + fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", + size, this->size, errmsg, suggest ?
MLOCK_SUGGESTION : ""); + return false; + } + } + + #undef MLOCK_SUGGESTION + + void raw_unlock(void * addr, size_t size) { + if (munlock(addr, size)) { + fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); + } + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + size_t lock_granularity() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return (size_t) si.dwPageSize; + } + + bool raw_lock(void * ptr, size_t len) { + for (int tries = 1; ; tries++) { + if (VirtualLock(ptr, len)) { + return true; + } + if (tries == 2) { + fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", + len, size, llama_v3_format_win_err(GetLastError()).c_str()); + return false; + } + + // It failed but this was only the first try; increase the working + // set size and try again. + SIZE_T min_ws_size, max_ws_size; + if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { + fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", + llama_v3_format_win_err(GetLastError()).c_str()); + return false; + } + // Per MSDN: "The maximum number of pages that a process can lock + // is equal to the number of pages in its minimum working set minus + // a small overhead." + // Hopefully a megabyte is enough overhead: + size_t increment = len + 1048576; + // The minimum must be <= the maximum, so we need to increase both: + min_ws_size += increment; + max_ws_size += increment; + if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { + fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", + llama_v3_format_win_err(GetLastError()).c_str()); + return false; + } + } + } + + void raw_unlock(void * ptr, size_t len) { + if (!VirtualUnlock(ptr, len)) { + fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", + llama_v3_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + size_t lock_granularity() { + return (size_t) 65536; + } + + bool raw_lock(const void * addr, size_t len) { + fprintf(stderr, "warning: mlock not supported on this system\n"); + return false; + } + + void raw_unlock(const void * addr, size_t len) {} +#endif +}; + +// Replacement for std::vector that doesn't require zero-initialization. 
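+// Illustrative usage, as a sketch only: the loaders further down always fill the
+// buffer before reading it, which is what makes the missing zero-initialization safe.
+//
+//     llama_v3_buffer buf;
+//     buf.resize(n_bytes);             // contents are uninitialized (non-Metal path)
+//     memcpy(buf.addr, src, n_bytes);  // write before any read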
+struct llama_v3_buffer { + uint8_t * addr = NULL; + size_t size = 0; + + llama_v3_buffer() = default; + + void resize(size_t len) { +#ifdef GGML_USE_METAL + free(addr); + int result = posix_memalign((void **) &addr, getpagesize(), len); + if (result == 0) { + memset(addr, 0, len); + } + else { + addr = NULL; + } +#else + delete[] addr; + addr = new uint8_t[len]; +#endif + size = len; + } + + ~llama_v3_buffer() { +#ifdef GGML_USE_METAL + free(addr); +#else + delete[] addr; +#endif + addr = NULL; + } + + // disable copy and move + llama_v3_buffer(const llama_v3_buffer&) = delete; + llama_v3_buffer(llama_v3_buffer&&) = delete; + llama_v3_buffer& operator=(const llama_v3_buffer&) = delete; + llama_v3_buffer& operator=(llama_v3_buffer&&) = delete; +}; + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +struct llama_v3_ctx_buffer { + uint8_t * addr = NULL; + bool is_cuda; + size_t size = 0; + + llama_v3_ctx_buffer() = default; + + void resize(size_t size) { + free(); + + addr = (uint8_t *) ggml_cuda_host_malloc(size); + if (addr) { + is_cuda = true; + } + else { + // fall back to pageable memory + addr = new uint8_t[size]; + is_cuda = false; + } + this->size = size; + } + + void free() { + if (addr) { + if (is_cuda) { + ggml_cuda_host_free(addr); + } + else { + delete[] addr; + } + } + addr = NULL; + } + + ~llama_v3_ctx_buffer() { + free(); + } + + // disable copy and move + llama_v3_ctx_buffer(const llama_v3_ctx_buffer&) = delete; + llama_v3_ctx_buffer(llama_v3_ctx_buffer&&) = delete; + llama_v3_ctx_buffer& operator=(const llama_v3_ctx_buffer&) = delete; + llama_v3_ctx_buffer& operator=(llama_v3_ctx_buffer&&) = delete; +}; +#else +typedef llama_v3_buffer llama_v3_ctx_buffer; +#endif + +#endif diff --git a/otherarch/llama_v2-util.h b/otherarch/llama_v2-util.h index 63942a007301b5d17d063f9a1e5b59c9b1a0ffd7..41b6df386931abeb09693c975c520230d82fd20e 100644 --- a/otherarch/llama_v2-util.h +++ b/otherarch/llama_v2-util.h @@ -50,9 +50,9 @@ #ifdef __GNUC__ #ifdef __MINGW32__ -__attribute__((format(gnu_printf, 1, 2))) +__attribute__((format_old(gnu_printf, 1, 2))) #else -__attribute__((format(printf, 1, 2))) +__attribute__((format_old(printf, 1, 2))) #endif #endif @@ -65,7 +65,7 @@ struct llama_v2_file { llama_v2_file(const char * fname, const char * mode) { fp = std::fopen(fname, mode); if (fp == NULL) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + throw std::runtime_error(format_old("failed to open %s: %s", fname, strerror(errno))); } seek(0, SEEK_END); size = tell(); @@ -98,7 +98,7 @@ struct llama_v2_file { errno = 0; std::size_t ret = std::fread(ptr, size, 1, fp); if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); + throw std::runtime_error(format_old("read error: %s", strerror(errno))); } if (ret != 1) { throw std::runtime_error(std::string("unexpectedly reached end of file")); @@ -124,7 +124,7 @@ struct llama_v2_file { errno = 0; size_t ret = std::fwrite(ptr, size, 1, fp); if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); + throw std::runtime_error(format_old("write error: %s", strerror(errno))); } } @@ -171,7 +171,7 @@ struct llama_v2_mmap { #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { - throw std::runtime_error(format("mmap failed: %s", strerror(errno))); + throw std::runtime_error(format_old("mmap failed: %s", strerror(errno))); } if (prefetch) { @@ -198,7 +198,7 @@ struct llama_v2_mmap { DWORD error = GetLastError(); if (hMapping == 
NULL) { - throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_v2_format_win_err(error).c_str())); + throw std::runtime_error(format_old("CreateFileMappingA failed: %s", llama_v2_format_win_err(error).c_str())); } addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); @@ -206,7 +206,7 @@ struct llama_v2_mmap { CloseHandle(hMapping); if (addr == NULL) { - throw std::runtime_error(format("MapViewOfFile failed: %s", llama_v2_format_win_err(error).c_str())); + throw std::runtime_error(format_old("MapViewOfFile failed: %s", llama_v2_format_win_err(error).c_str())); } #ifndef USE_FAILSAFE diff --git a/otherarch/llama_v2.cpp b/otherarch/llama_v2.cpp index ce7e2f771554d9438507622f6be9066099992dc6..01b47697ce8776103bb899cfd0389aa6f6a88026 100644 --- a/otherarch/llama_v2.cpp +++ b/otherarch/llama_v2.cpp @@ -282,7 +282,7 @@ template static T checked_mul2(T a, T b) { T ret = a * b; if (a != 0 && ret / a != b) { - throw format("overflow multiplying %llu * %llu", + throw format_old("overflow multiplying %llu * %llu", (unsigned long long) a, (unsigned long long) b); } return ret; @@ -290,7 +290,7 @@ static T checked_mul2(T a, T b) { static size_t checked_div2(size_t a, size_t b) { if (b == 0 || a % b != 0) { - throw format("error dividing %zu / %zu", a, b); + throw format_old("error dividing %zu / %zu", a, b); } return a / b; } @@ -354,7 +354,7 @@ struct llama_v2_load_tensor { const auto & first_shard = shards.at(0); for (const auto & shard : shards) { if (shard.type != first_shard.type) { - throw format("inconsistent tensor shard type in '%s'", name.c_str()); + throw format_old("inconsistent tensor shard type in '%s'", name.c_str()); } } type = first_shard.type; @@ -377,7 +377,7 @@ struct llama_v2_load_tensor { const auto & first_shard = shards.at(0); for (const auto & shard : shards) { if (shard.ne != first_shard.ne) { - throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s", + throw format_old("inconsistent tensor shard shape in '%s': first was %s, other was %s", name.c_str(), llama_v2_format_tensor_shape(first_shard.ne).c_str(), llama_v2_format_tensor_shape(shard.ne).c_str()); } } @@ -451,7 +451,7 @@ struct llama_v2_file_loader { } else if (magic == 'ggjt' && version == 3) { file_version = LLAMA_V2_FILE_VERSION_GGJT_V3; } else { - throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", + throw format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", magic, version); } } @@ -500,7 +500,7 @@ struct llama_v2_file_loader { file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims); std::string name = file.read_string(name_len); if (n_dims < 1 || n_dims > 2) { - throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims); + throw format_old("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims); } switch (shard.type) { case GGML_V2_TYPE_F32: @@ -514,7 +514,7 @@ struct llama_v2_file_loader { case GGML_V2_TYPE_Q8_0: break; default: { - throw format("unrecognized tensor type %u\n", shard.type); + throw format_old("unrecognized tensor type %u\n", shard.type); } } @@ -620,7 +620,7 @@ struct llama_v2_model_loader { auto * ith_file = new llama_v2_file_loader(fname.c_str(), i, tensors_map); file_loaders.emplace_back(ith_file); if (ith_file->hparams != first_file->hparams) { - throw format("llama.cpp: hparams inconsistent between files"); + throw format_old("llama.cpp: hparams inconsistent between files"); } } if (!llama_v2_mmap::SUPPORTED) { 
@@ -667,11 +667,11 @@ struct llama_v2_model_loader { struct ggml_v2_tensor * get_tensor(const std::string & name, const std::vector & ne) { auto it = tensors_map.name_to_idx.find(name); if (it == tensors_map.name_to_idx.end()) { - throw format("llama.cpp: tensor '%s' is missing from model", name.c_str()); + throw format_old("llama.cpp: tensor '%s' is missing from model", name.c_str()); } llama_v2_load_tensor & lt = tensors_map.tensors.at(it->second); if (lt.ne != ne) { - throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", + throw format_old("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", name.c_str(), llama_v2_format_tensor_shape(ne).c_str(), llama_v2_format_tensor_shape(lt.ne).c_str()); } @@ -1016,7 +1016,7 @@ static void llama_v2_model_load_internal( model.ctx = ggml_v2_init(params); if (!model.ctx) { - throw format("ggml_v2_init() failed"); + throw format_old("ggml_v2_init() failed"); } } @@ -2042,7 +2042,7 @@ static void llama_v2_model_quantize_internal(const std::string & fname_inp, cons case LLAMA_V2_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_V2_TYPE_Q5_0; break; case LLAMA_V2_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_V2_TYPE_Q5_1; break; case LLAMA_V2_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_V2_TYPE_Q8_0; break; - default: throw format("invalid output file type %d\n", ftype); + default: throw format_old("invalid output file type %d\n", ftype); }; if (nthread <= 0) { @@ -2108,7 +2108,7 @@ static void llama_v2_model_quantize_internal(const std::string & fname_inp, cons f32_data[i] = ggml_v2_fp16_to_fp32(f16_data[i]); } } else { - throw format("type %s unsupported for integer quantization", ggml_v2_type_name(tensor.type)); + throw format_old("type %s unsupported for integer quantization", ggml_v2_type_name(tensor.type)); } printf("quantizing .. "); @@ -3101,4 +3101,4 @@ std::vector llama_v2_tokenize(struct llama_v2_context * ctx, const res.resize(n); return res; -} \ No newline at end of file +} diff --git a/otherarch/llama_v3.cpp b/otherarch/llama_v3.cpp new file mode 100644 index 0000000000000000000000000000000000000000..609b2bfa3d537ca528bf5fe84ffa1fefed2ce3e2 --- /dev/null +++ b/otherarch/llama_v3.cpp @@ -0,0 +1,4494 @@ +// Defines fileno on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#include +#include +#include +#endif + +#include "llama-util.h" +#include "llama_v3.h" + +#include "ggml.h" +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif +#if defined(GGML_USE_CLBLAST) +#include "ggml-opencl.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif +#ifdef GGML_USE_MPI +#include "ggml-mpi.h" +#endif +#ifdef GGML_USE_K_QUANTS +#ifndef QK_K +#ifdef GGML_QKK_64 +#define QK_K 64 +#else +#define QK_K 256 +#endif +#endif +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static void llama_v3_log_internal(llama_v3_log_level level, const char* format, ...); +static void llama_v3_log_callback_default(llama_v3_log_level level, const char * text, void * user_data); +#define LLAMA_V3_LOG_INFO(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_INFO , __VA_ARGS__) +#define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__) +#define LLAMA_V3_LOG_ERROR(...) 
llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+
+#if !defined(GGML_USE_CUBLAS)
+#include "ggml-alloc.h"
+#define LLAMA_V3_USE_ALLOCATOR
+#else
+#define LLAMA_V3_USE_SCRATCH
+#define LLAMA_V3_MAX_SCRATCH_BUFFERS 16
+#endif
+
+
+// available llama models
+enum e_model3 {
+    MODEL_UNKNOWN_3,
+    MODEL_3B_3,
+    MODEL_7B_3,
+    MODEL_13B_3,
+    MODEL_30B_3,
+    MODEL_34B_3,
+    MODEL_65B_3,
+    MODEL_70B_3,
+};
+
+static const size_t kB3 = 1024;
+static const size_t MB3 = 1024*1024;
+
+// computed for n_ctx == 2048
+// TODO: dynamically determine these sizes
+//       needs modifications in ggml
+
+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+
+void llama_v3_nop(struct ggml_tensor * tensor) { // don't offload by default
+    (void) tensor;
+}
+
+//
+// ggml helpers
+//
+
+static void llv3_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+
+//
+// memory sizes (calculated for n_batch == 512)
+//
+
+static std::map<e_model3, size_t> MEM_REQ_SCRATCH0_3(int n_ctx)
+{
+    std::map<e_model3, size_t> k_sizes = {
+        { MODEL_3B_3,  ((size_t) n_ctx / 16ull + 156ull) * MB3 },
+        { MODEL_7B_3,  ((size_t) n_ctx / 16ull + 164ull) * MB3 },
+        { MODEL_13B_3, ((size_t) n_ctx / 12ull + 184ull) * MB3 },
+        { MODEL_30B_3, ((size_t) n_ctx /  9ull + 224ull) * MB3 },
+        { MODEL_34B_3, ((size_t) n_ctx /  8ull + 256ull) * MB3 }, // guess
+        { MODEL_65B_3, ((size_t) n_ctx /  6ull + 320ull) * MB3 }, // guess
+        { MODEL_70B_3, ((size_t) n_ctx /  7ull + 320ull) * MB3 },
+    };
+    return k_sizes;
+}
+
+static const std::map<e_model3, size_t> & MEM_REQ_SCRATCH1_3()
+{
+    static std::map<e_model3, size_t> k_sizes = {
+        { MODEL_3B_3,  192ull * MB3 },
+        { MODEL_7B_3,  224ull * MB3 },
+        { MODEL_13B_3, 256ull * MB3 },
+        { MODEL_30B_3, 320ull * MB3 },
+        { MODEL_34B_3, 380ull * MB3 }, // guess
+        { MODEL_65B_3, 448ull * MB3 }, // guess
+        { MODEL_70B_3, 448ull * MB3 },
+    };
+    return k_sizes;
+}
+
+// used to store the compute graph tensors + non-scratch data
+static const std::map<e_model3, size_t> & MEM_REQ_EVAL_3()
+{
+    static std::map<e_model3, size_t> k_sizes = {
+        { MODEL_3B_3,  16ull * MB3 },
+        { MODEL_7B_3,  20ull * MB3 },
+        { MODEL_13B_3, 24ull * MB3 },
+        { MODEL_30B_3, 32ull * MB3 },
+        { MODEL_34B_3, 38ull * MB3 }, // guess
+        { MODEL_65B_3, 48ull * MB3 }, // guess
+        { MODEL_70B_3, 48ull * MB3 },
+    };
+    return k_sizes;
+}
+
+// amount of VRAM needed per batch size to hold temporary results
+// the values for 3b are not derived from testing but instead chosen conservatively
+static const std::map<e_model3, size_t> & VRAM_REQ_SCRATCH_BASE_3()
+{
+    static std::map<e_model3, size_t> k_sizes = {
+        { MODEL_3B_3,   512ull * kB3 },
+        { MODEL_7B_3,   512ull * kB3 },
+        { MODEL_13B_3,  640ull * kB3 },
+        { MODEL_30B_3,  768ull * kB3 },
+        { MODEL_34B_3,  960ull * kB3 },
+        { MODEL_65B_3, 1360ull * kB3 },
+        { MODEL_70B_3, 1360ull * kB3 },
+    };
+    return k_sizes;
+}
+
+// amount of VRAM needed per batch size and context to hold temporary results
+// the values for 3b are not derived from testing but instead chosen conservatively
+static const std::map<e_model3, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT_3()
+{
+    static std::map<e_model3, size_t> k_sizes = {
+        { MODEL_3B_3,  128ull },
+        { MODEL_7B_3,  128ull },
+        { MODEL_13B_3, 160ull },
+        { MODEL_30B_3, 208ull },
+        { MODEL_34B_3, 256ull },
+        { MODEL_65B_3, 320ull },
+        { MODEL_70B_3, 320ull },
+    };
+    return k_sizes;
+}
+
+// default hparams (LLaMA 7B)
+struct llama_v3_hparams {
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx   = 512;   // this is provided as user input?
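+    // A worked example under these 7B defaults, assuming the f16 KV cache that
+    // llama_v3_context_default_params() selects: kv_size() below evaluates to
+    //     2 * n_embd_gqa() * n_ctx * n_layer * sizeof(ggml_fp16_t)
+    //   = 2 * 4096 * 512 * 32 * 2 bytes = 256 MB for the K and V tensors together.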
+    uint32_t n_embd  = 4096;
+    uint32_t n_mult  = 256;
+    uint32_t n_head  = 32;
+    uint32_t n_head_kv = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot   = 64;
+
+    // LLaMAv2
+    // TODO: load from model data hparams
+    float f_ffn_mult = 1.0f;
+    float f_rms_norm_eps = LLAMA_V3_DEFAULT_RMS_EPS;
+
+    float rope_freq_base  = 10000.0f;
+    float rope_freq_scale = 1.0f;
+
+    enum llama_v3_ftype ftype = LLAMA_V3_FTYPE_MOSTLY_F16;
+
+    bool operator!=(const llama_v3_hparams & other) const {
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_v3_hparams))); // NOLINT
+    }
+
+    uint32_t n_gqa() const {
+        return n_head/n_head_kv;
+    }
+
+    uint32_t n_embd_head() const {
+        return n_embd/n_head;
+    }
+
+    uint32_t n_embd_gqa() const {
+        return n_embd/n_gqa();
+    }
+
+    size_t kv_size() const {
+        size_t result = 2ull;
+        result *= (size_t) n_embd_gqa();
+        result *= (size_t) n_ctx;
+        result *= (size_t) n_layer;
+        result *= sizeof(ggml_fp16_t);
+        return result;
+    }
+};
+
+struct llama_v3_layer {
+    // normalization
+    struct ggml_tensor * attention_norm;
+
+    // attention
+    struct ggml_tensor * wq;
+    struct ggml_tensor * wk;
+    struct ggml_tensor * wv;
+    struct ggml_tensor * wo;
+
+    // normalization
+    struct ggml_tensor * ffn_norm;
+
+    // ff
+    struct ggml_tensor * w1;
+    struct ggml_tensor * w2;
+    struct ggml_tensor * w3;
+};
+
+struct llama_v3_kv_cache {
+    struct ggml_tensor * k = NULL;
+    struct ggml_tensor * v = NULL;
+
+    struct ggml_context * ctx = NULL;
+
+    llama_v3_ctx_buffer buf;
+
+    int n; // number of tokens currently in the cache
+
+    ~llama_v3_kv_cache() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+
+#ifdef GGML_USE_CUBLAS
+        ggml_cuda_free_data(k);
+        ggml_cuda_free_data(v);
+#endif // GGML_USE_CUBLAS
+    }
+};
+
+struct llama_v3_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
+};
+
+struct llama_v3_model {
+    e_model3 type = MODEL_UNKNOWN_3;
+
+    llama_v3_hparams hparams;
+
+    struct ggml_tensor * tok_embeddings;
+
+    struct ggml_tensor * norm;
+    struct ggml_tensor * output;
+
+    std::vector<llama_v3_layer> layers;
+    int n_gpu_layers;
+
+    // context
+    struct ggml_context * ctx = NULL;
+
+    // the model memory buffer
+    llama_v3_ctx_buffer buf;
+
+    // model memory mapped file
+    std::unique_ptr<llama_v3_mmap> mapping;
+
+    // objects representing data potentially being locked in memory
+    llama_v3_mlock mlock_buf;
+    llama_v3_mlock mlock_mmap;
+
+    // for quantize-stats only
+    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
+
+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+
+    llama_v3_vocab vocab;
+
+    ~llama_v3_model() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+
+#ifdef GGML_USE_CUBLAS
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cuda_free_data(tensors_by_name[i].second);
+        }
+        ggml_cuda_free_scratch();
+#elif defined(GGML_USE_CLBLAST)
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cl_free_data(tensors_by_name[i].second);
+        }
+#endif
+    }
+};
+
+struct llama_v3_context {
+    llama_v3_context(const llama_v3_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+    ~llama_v3_context() {
+        if (model_owner) {
+            delete &model;
+        }
+#ifdef GGML_USE_METAL
+        if (ctx_metal) {
+            ggml_metal_free(ctx_metal);
+        }
+#endif
+#ifdef LLAMA_V3_USE_ALLOCATOR
+        if (alloc) {
+            ggml_allocr_free(alloc);
+        }
+#endif
+    }
+
+    std::mt19937 rng;
+
+    bool has_evaluated_once = false;
+
+    int64_t t_sample_us = 0;
+    int64_t t_eval_us   = 0;
+    int64_t t_p_eval_us = 0;
+
+    int32_t n_sample = 0; // number of tokens sampled
+    int32_t n_eval   = 0; // number of eval calls
+    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
+
+    const llama_v3_model & model;
+
+    bool model_owner = false;
+
+    int64_t t_load_us;
+    int64_t t_start_us;
+
+    // key + value cache for the self attention
+    struct llama_v3_kv_cache kv_self;
+
+    size_t mem_per_token = 0;
+
+    // decode output (2-dimensional array: [n_tokens][n_vocab])
+    std::vector<float> logits;
+    bool logits_all = false;
+
+    // input embedding (1-dimensional array: [n_embd])
+    std::vector<float> embedding;
+
+    // reusable buffer for `struct ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
+
+    // memory buffers used to evaluate the model
+    // TODO: move in llama_v3_state
+    llama_v3_ctx_buffer buf_compute;
+
+#ifdef LLAMA_V3_USE_ALLOCATOR
+    llama_v3_ctx_buffer buf_alloc;
+    ggml_allocr * alloc = NULL;
+#endif
+
+#ifdef LLAMA_V3_USE_SCRATCH
+    llama_v3_ctx_buffer buf_scratch[LLAMA_V3_MAX_SCRATCH_BUFFERS];
+    int    buf_last = 0;
+    size_t buf_max_size[LLAMA_V3_MAX_SCRATCH_BUFFERS] = { 0 };
+#endif
+
+#ifdef GGML_USE_METAL
+    ggml_metal_context * ctx_metal = NULL;
+#endif
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_context * ctx_mpi = NULL;
+#endif
+
+    void use_buf(struct ggml_context * ctx, int i) {
+#if defined(LLAMA_V3_USE_SCRATCH)
+        size_t last_size = 0;
+
+        if (i == -1) {
+            last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
+        } else {
+            auto & buf = buf_scratch[i];
+            last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
+        }
+
+        if (buf_last >= 0) {
+            buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
+        }
+
+        buf_last = i;
+#else
+        (void) i;
+        (void) ctx;
+#endif
+    }
+
+    size_t get_buf_max_mem(int i) const {
+#if defined(LLAMA_V3_USE_SCRATCH)
+        return buf_max_size[i];
+#else
+        (void) i;
+        return 0;
+#endif
+    }
+};
+
+struct llama_v3_state {
+    // We save the log callback globally
+    llama_v3_log_callback log_callback = llama_v3_log_callback_default;
+    void * log_callback_user_data = nullptr;
+};
+// global state
+static llama_v3_state llv3_g_state;
+
+template <typename T>
+static T checked_mul(T a, T b) {
+    T ret = a * b;
+    if (a != 0 && ret / a != b) {
+        throw std::runtime_error(format_old("overflow multiplying %llu * %llu",
+                (unsigned long long) a, (unsigned long long) b));
+    }
+    return ret;
+}
+
+static size_t checked_div(size_t a, size_t b) {
+    if (b == 0 || a % b != 0) {
+        throw std::runtime_error(format_old("error dividing %zu / %zu", a, b));
+    }
+    return a / b;
+}
+
+static std::string llama_v3_format_tensor_shape(const std::vector<uint32_t> & ne) {
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5u", ne.at(0));
+    for (size_t i = 1; i < ne.size(); i++) {
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
+    }
+    return buf;
+}
+
+static size_t llama_v3_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
+    size_t size = ggml_type_size(type);
+    for (uint32_t dim : ne) {
+        size = checked_mul<size_t>(size, dim);
+    }
+    return size / ggml_blck_size(type);
+}
+
+struct llama_v3_load_tensor {
+    std::string name;
+    enum ggml_type type = GGML_TYPE_F32;
+    std::vector<uint32_t> ne;
+    size_t file_off;
+    size_t size;
+    struct ggml_tensor * ggml_tensor = NULL;
+    uint8_t * data;
+};
+
+struct llama_v3_load_tensors_map {
+    // tensors is kept in a separate vector to preserve file order
+    std::vector<llama_v3_load_tensor> tensors;
+    std::unordered_map<std::string, size_t> name_to_idx;
+};
+
+enum llama_v3_file_version {
+    LLAMA_V3_FILE_VERSION_GGML,
+    LLAMA_V3_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
+    LLAMA_V3_FILE_VERSION_GGJT_V1, // added padding
+
LLAMA_V3_FILE_VERSION_GGJT_V2, // changed quantization format + LLAMA_V3_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format +}; + +struct llama_v3_file_loader { + llama_v3_file file; + llama_v3_file_version file_version; + llama_v3_hparams hparams; + llama_v3_vocab vocab; + + llama_v3_file_loader(const char * fname, llama_v3_load_tensors_map & tensors_map) + : file(fname, "rb") { + LLAMA_V3_LOG_INFO("llama.cpp: loading model from %s\n", fname); + read_magic(); + read_hparams(); + read_vocab(); + read_tensor_metadata(tensors_map); + } + void read_magic() { + uint32_t magic = file.read_u32(); + + if (magic == LLAMA_V3_FILE_MAGIC_GGML) { + file_version = LLAMA_V3_FILE_VERSION_GGML; + return; + } + + uint32_t version = file.read_u32(); + + switch (magic) { + case LLAMA_V3_FILE_MAGIC_GGMF: + switch (version) { + case 1: file_version = LLAMA_V3_FILE_VERSION_GGMF_V1; return; + } + break; + case LLAMA_V3_FILE_MAGIC_GGJT: + switch (version) { + case 1: file_version = LLAMA_V3_FILE_VERSION_GGJT_V1; return; + case 2: file_version = LLAMA_V3_FILE_VERSION_GGJT_V2; return; + case 3: file_version = LLAMA_V3_FILE_VERSION_GGJT_V3; return; + } + } + + throw std::runtime_error(format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", + magic, version)); + } + void read_hparams() { + hparams.n_vocab = file.read_u32(); + hparams.n_embd = file.read_u32(); + hparams.n_mult = file.read_u32(); + hparams.n_head = file.read_u32(); + hparams.n_layer = file.read_u32(); + hparams.n_rot = file.read_u32(); + hparams.ftype = (enum llama_v3_ftype) file.read_u32(); + + // LLaMAv2 + // TODO: read from header + hparams.n_head_kv = hparams.n_head; + } + void read_vocab() { + vocab.id_to_token.resize(hparams.n_vocab); + + for (uint32_t i = 0; i < hparams.n_vocab; i++) { + uint32_t len = file.read_u32(); + std::string word = file.read_string(len); + + float score = 0.0f; + file.read_raw(&score, sizeof(score)); + + vocab.token_to_id[word] = i; + + auto & tok_score = vocab.id_to_token[i]; + tok_score.tok = std::move(word); + tok_score.score = score; + } + } + void read_tensor_metadata(llama_v3_load_tensors_map & tensors_map) { + while (file.tell() < file.size) { + llama_v3_load_tensor tensor; + uint32_t n_dims = file.read_u32(); + uint32_t name_len = file.read_u32(); + tensor.type = (enum ggml_type) file.read_u32(); + tensor.ne.resize(n_dims); + file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims); + std::string name = file.read_string(name_len); + if (n_dims < 1 || n_dims > 2) { + throw std::runtime_error(format_old("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims)); + } + switch (tensor.type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + break; + default: { + throw std::runtime_error(format_old("unrecognized tensor type %u\n", tensor.type)); + } + } + + // skip to the next multiple of 32 bytes + if (file_version >= LLAMA_V3_FILE_VERSION_GGJT_V1) { + file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); + } + + tensor.file_off = file.tell(); + tensor.name = name; + tensor.size = llama_v3_calc_tensor_size(tensor.ne, tensor.type); + file.seek(tensor.size, SEEK_CUR); + + tensors_map.tensors.push_back(tensor); + tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1; + } + } +}; + +struct llama_v3_file_saver { + 
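+    // The saver mirrors the loader above: magic + version, hparams, scored vocab,
+    // then per-tensor metadata and data. Tensor data is 32-byte aligned via
+    // seek(-static_cast<ptrdiff_t>(tell()) & 31, SEEK_CUR); as illustrative
+    // arithmetic only, at offset 100 this skips 28 bytes so data begins at 128.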
llama_v3_file file; + llama_v3_file_loader * any_file_loader; + llama_v3_file_saver(const char * fname, llama_v3_file_loader * any_file_loader, enum llama_v3_ftype new_ftype) + : file(fname, "wb"), any_file_loader(any_file_loader) { + LLAMA_V3_LOG_INFO("llama.cpp: saving model to %s\n", fname); + write_magic(); + write_hparams(new_ftype); + write_vocab(); + } + void write_magic() { + file.write_u32(LLAMA_V3_FILE_MAGIC); // magic + file.write_u32(LLAMA_V3_FILE_VERSION); // version + } + void write_hparams(enum llama_v3_ftype new_ftype) { + const llama_v3_hparams & hparams = any_file_loader->hparams; + file.write_u32(hparams.n_vocab); + file.write_u32(hparams.n_embd); + file.write_u32(hparams.n_mult); + file.write_u32(hparams.n_head); + file.write_u32(hparams.n_layer); + file.write_u32(hparams.n_rot); + file.write_u32(new_ftype); + } + void write_vocab() { + if (any_file_loader->file_version == LLAMA_V3_FILE_VERSION_GGML) { + LLAMA_V3_LOG_WARN("llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); + } + uint32_t n_vocab = any_file_loader->hparams.n_vocab; + for (uint32_t i = 0; i < n_vocab; i++) { + const auto & token_score = any_file_loader->vocab.id_to_token.at(i); + file.write_u32((uint32_t) token_score.tok.size()); + file.write_raw(token_score.tok.data(), token_score.tok.size()); + file.write_raw(&token_score.score, sizeof(token_score.score)); + } + } + void write_tensor(llama_v3_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { + switch (new_type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + break; + default: LLAMA_V3_ASSERT(false); + } + file.write_u32((uint32_t) tensor.ne.size()); + file.write_u32((uint32_t) tensor.name.size()); + file.write_u32(new_type); + file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); + file.write_raw(tensor.name.data(), tensor.name.size()); + file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); + LLAMA_V3_ASSERT(new_size == llama_v3_calc_tensor_size(tensor.ne, new_type)); + file.write_raw(new_data, new_size); + } +}; + +struct llama_v3_model_loader { + std::unique_ptr file_loader; + llama_v3_load_tensors_map tensors_map; + bool use_mmap; + size_t num_ggml_tensors_created = 0; + struct ggml_context * ggml_ctx = NULL; + std::unique_ptr mapping; + + llama_v3_model_loader(const std::string & fname_base, bool use_mmap) { + file_loader = std::unique_ptr(new llama_v3_file_loader(fname_base.c_str(), tensors_map)); + if (!llama_v3_mmap::SUPPORTED) { + use_mmap = false; + } + this->use_mmap = use_mmap; + } + + void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const { + *ctx_size_p = *mmapped_size_p = 0; + for (const llama_v3_load_tensor & lt : tensors_map.tensors) { + *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + *(use_mmap ? 
mmapped_size_p : ctx_size_p) += lt.size + 16;
+        }
+    }
+
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str()));
+        }
+        llama_v3_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw std::runtime_error(format_old("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                    name.c_str(), llama_v3_format_tensor_shape(ne).c_str(), llama_v3_format_tensor_shape(lt.ne).c_str()));
+        }
+
+        return get_tensor_for(lt, backend);
+    }
+
+    struct ggml_tensor * get_tensor_for(llama_v3_load_tensor & lt, ggml_backend backend) {
+        struct ggml_tensor * tensor;
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, true);
+        }
+        if (lt.ne.size() == 2) {
+            tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
+        } else {
+            LLAMA_V3_ASSERT(lt.ne.size() == 1);
+            tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
+        }
+        ggml_set_name(tensor, lt.name.c_str());
+        LLAMA_V3_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+
+        if (backend != GGML_BACKEND_CPU) {
+            ggml_set_no_alloc(ggml_ctx, use_mmap);
+        }
+        tensor->backend = backend;
+        lt.ggml_tensor = tensor;
+        num_ggml_tensors_created++;
+        return tensor;
+    }
+
+    void done_getting_tensors() const {
+        if (num_ggml_tensors_created != tensors_map.tensors.size()) {
+            throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
+        }
+    }
+
+    void load_all_data(llama_v3_progress_callback progress_callback, void * progress_callback_user_data, llama_v3_mlock * lmlock) {
+        size_t data_size = 0;
+        size_t prefetch_size = file_loader->file.size;
+        size_t lock_size = 0;
+        for (const llama_v3_load_tensor & lt : tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                prefetch_size -= lt.size;
+            }
+        }
+
+        if (use_mmap) {
+            mapping.reset(new llama_v3_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
+            if (lmlock) {
+                lmlock->init(mapping->addr);
+            }
+        }
+
+        size_t done_size = 0;
+        for (llama_v3_load_tensor & lt : tensors_map.tensors) {
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            LLAMA_V3_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
+            lt.data = (uint8_t *) lt.ggml_tensor->data;
+
+            // allocate temp buffer if not using mmap
+            if (!use_mmap && lt.data == NULL) {
+                GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
+                lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+            }
+
+            load_data_for(lt);
+
+            switch(lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    if (use_mmap && lmlock) {
+                        lock_size += lt.size;
+                        lmlock->grow_to(lock_size);
+                    }
+                    break;
+#if defined(GGML_USE_CUBLAS)
+                case GGML_BACKEND_GPU:
+                case GGML_BACKEND_GPU_SPLIT:
+                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#elif defined(GGML_USE_CLBLAST)
+                case GGML_BACKEND_GPU:
+                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    if (!use_mmap) {
+                        free(lt.data);
+                    }
+                    break;
+#endif
+                default:
+                    continue;
+            }
+
+            done_size += lt.size;
+        }
+    }
+
+    void load_data_for(llama_v3_load_tensor & lt) {
+        if (use_mmap) {
+            lt.data = (uint8_t *) mapping->addr +
lt.file_off; + } else { + llama_v3_file & file = file_loader->file; + file.seek(lt.file_off, SEEK_SET); + file.read_raw(lt.data, lt.size); + } + + if (0) { + print_checksum(lt); + } + } + + static void print_checksum(llama_v3_load_tensor & lt) { + uint32_t sum = 0; + for (size_t i = 0; i < lt.size; i++) { + uint8_t byte = lt.data[i]; + sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash + } + LLAMA_V3_LOG_INFO("%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum, + llama_v3_format_tensor_shape(lt.ne).c_str(), lt.size); + } + +}; + +// +// kv cache +// + +static bool kv_cache_init( + const struct llama_v3_hparams & hparams, + struct llama_v3_kv_cache & cache, + ggml_type wtype, + int n_ctx, + int n_gpu_layers) { + const int n_embd = hparams.n_embd_gqa(); + const int n_layer = hparams.n_layer; + + const int64_t n_mem = n_layer*n_ctx; + const int64_t n_elements = n_embd*n_mem; + + cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB3); + cache.n = 0; + + struct ggml_init_params params; + params.mem_size = cache.buf.size; + params.mem_buffer = cache.buf.addr; + params.no_alloc = false; + + cache.ctx = ggml_init(params); + + if (!cache.ctx) { + LLAMA_V3_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + + cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + ggml_set_name(cache.k, "cache_k"); + ggml_set_name(cache.v, "cache_v"); + + (void) n_gpu_layers; +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer + 1) { + ggml_cuda_assign_buffers_no_scratch(cache.v); + } + if (n_gpu_layers > n_layer + 2) { + ggml_cuda_assign_buffers_no_scratch(cache.k); + } +#endif // GGML_USE_CUBLAS + + return true; +} + +struct llama_v3_context_params llama_v3_context_default_params() { + struct llama_v3_context_params result = { + /*.seed =*/ LLAMA_V3_DEFAULT_SEED, + /*.n_ctx =*/ 512, + /*.n_batch =*/ 512, + /*.n_gqa =*/ 1, + /*.rms_norm_eps =*/ LLAMA_V3_DEFAULT_RMS_EPS, + /*.gpu_layers =*/ 0, + /*.main_gpu =*/ 0, + /*.tensor_split =*/ nullptr, + /*.rope_freq_base =*/ 10000.0f, + /*.rope_freq_scale =*/ 1.0f, + /*.progress_callback =*/ nullptr, + /*.progress_callback_user_data =*/ nullptr, + /*.low_vram =*/ false, + /*.mul_mat_q =*/ false, + /*.f16_kv =*/ true, + /*.logits_all =*/ false, + /*.vocab_only =*/ false, + /*.use_mmap =*/ true, + /*.use_mlock =*/ false, + /*.embedding =*/ false, + }; + + return result; +} + +struct llama_v3_model_quantize_params llama_v3_model_quantize_default_params() { + struct llama_v3_model_quantize_params result = { + /*.nthread =*/ 0, + /*.ftype =*/ LLAMA_V3_FTYPE_MOSTLY_Q5_1, + /*.allow_requantize =*/ false, + /*.quantize_output_tensor =*/ true, + }; + + return result; +} + +int llama_v3_max_devices() { + return LLAMA_V3_MAX_DEVICES; +} + +bool llama_v3_mmap_supported() { + return llama_v3_mmap::SUPPORTED; +} + +bool llama_v3_mlock_supported() { + return llama_v3_mlock::SUPPORTED; +} + +int get_blas_batch_mul3(int batch) +{ + return (batch>512?(batch>1024?4:2):1); +} + +void llama_v3_backend_init(bool numa) { + ggml_time_init(); + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } + + if (numa) { + ggml_numa_init(); + } + +#ifdef GGML_USE_MPI + ggml_mpi_backend_init(); +#endif +} + +void llama_v3_backend_free() { +#ifdef GGML_USE_MPI + ggml_mpi_backend_free(); +#endif +} + +int64_t llama_v3_time_us() { + return ggml_time_us(); +} + +// +// 
model loading +// + +static const char * llama_v3_file_version_name(llama_v3_file_version version) { + switch (version) { + case LLAMA_V3_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; + case LLAMA_V3_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; + case LLAMA_V3_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; + case LLAMA_V3_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)"; + case LLAMA_V3_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)"; + } + + return "unknown"; +} + +const char * llama_v3_ftype_name(enum llama_v3_ftype ftype) { + switch (ftype) { + case LLAMA_V3_FTYPE_ALL_F32: return "all F32"; + case LLAMA_V3_FTYPE_MOSTLY_F16: return "mostly F16"; + case LLAMA_V3_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; + case LLAMA_V3_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; + case LLAMA_V3_FTYPE_MOSTLY_Q4_1_SOME_F16: + return "mostly Q4_1, some F16"; + case LLAMA_V3_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; + case LLAMA_V3_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; + case LLAMA_V3_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; + // K-quants + case LLAMA_V3_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; + case LLAMA_V3_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small"; + case LLAMA_V3_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium"; + case LLAMA_V3_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large"; + case LLAMA_V3_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small"; + case LLAMA_V3_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium"; + case LLAMA_V3_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small"; + case LLAMA_V3_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium"; + case LLAMA_V3_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; + default: return "unknown, may not work"; + } +} + +static const char * llama_v3_model_type_name(e_model3 type) { + switch (type) { + case MODEL_3B_3: return "3B"; + case MODEL_7B_3: return "7B"; + case MODEL_13B_3: return "13B"; + case MODEL_30B_3: return "30B"; + case MODEL_34B_3: return "34B"; + case MODEL_65B_3: return "65B"; + case MODEL_70B_3: return "70B"; + default: LLAMA_V3_ASSERT(false); + } +} + +static void llama_v3_model_load_internal( + const std::string & fname, + llama_v3_model & model, + llama_v3_vocab & vocab, + int n_ctx, + int n_batch, + int n_gqa, + float rms_norm_eps, + int n_gpu_layers, + int main_gpu, + const float * tensor_split, + const bool mul_mat_q, + float rope_freq_base, + float rope_freq_scale, + bool low_vram, + ggml_type memory_type, + bool use_mmap, + bool use_mlock, + bool vocab_only, + llama_v3_progress_callback progress_callback, + void * progress_callback_user_data) { + + model.t_start_us = ggml_time_us(); + size_t blasbatchmul = get_blas_batch_mul3(n_batch); + + std::unique_ptr ml(new llama_v3_model_loader(fname, use_mmap)); + + vocab = std::move(ml->file_loader->vocab); + model.hparams = ml->file_loader->hparams; + model.n_gpu_layers = n_gpu_layers; + llama_v3_file_version file_version = ml->file_loader->file_version; + + auto & hparams = model.hparams; + + // TODO: read from file + hparams.f_rms_norm_eps = rms_norm_eps; + + { + switch (hparams.n_layer) { + case 26: model.type = e_model3::MODEL_3B_3; break; + case 32: model.type = e_model3::MODEL_7B_3; break; + case 40: model.type = e_model3::MODEL_13B_3; break; + case 48: model.type = e_model3::MODEL_34B_3; break; + case 60: model.type = e_model3::MODEL_30B_3; break; + case 80: model.type = e_model3::MODEL_65B_3; break; + default: + { + if (hparams.n_layer < 32) { + model.type = e_model3::MODEL_7B_3; + } + } break; + } + + 
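+        // Pre-GGUF headers do not store a model-size field, so the size is inferred
+        // from n_layer alone (26 -> 3B, 32 -> 7B, 40 -> 13B, 48 -> 34B, 60 -> 30B,
+        // 80 -> 65B); the 65B/70B ambiguity is resolved by the GQA patches below.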
hparams.n_ctx = n_ctx; + + // LLaMAv2 + // TODO: temporary until GGUF + //patch for llama2 gqa + if (model.type == e_model3::MODEL_65B_3 && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) { + fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__); + n_gqa = 8; + } + + if (model.type == e_model3::MODEL_34B_3) { + fprintf(stderr, "%s: Applying KCPP Patch for 34B model, setting GQA to 8\n", __func__); + n_gqa = 8; + } + LLAMA_V3_ASSERT(hparams.n_head % n_gqa == 0); + hparams.n_head_kv = hparams.n_head / n_gqa; + if (model.type == e_model3::MODEL_65B_3 && n_gqa == 8) { + LLAMA_V3_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa); + model.type = e_model3::MODEL_70B_3; + hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model + } + + hparams.rope_freq_base = rope_freq_base; + hparams.rope_freq_scale = rope_freq_scale; + } + + // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199 + const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3; + const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw; + const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; + //const uint32_t n_ff = 28672; + + { + LLAMA_V3_LOG_INFO("%s: format = %s\n", __func__, llama_v3_file_version_name(file_version)); + LLAMA_V3_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); + LLAMA_V3_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx); + LLAMA_V3_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_V3_LOG_INFO("%s: n_mult = %u\n", __func__, hparams.n_mult); + LLAMA_V3_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); + LLAMA_V3_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); + LLAMA_V3_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); + LLAMA_V3_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. 
n_embd_head, n_head_dim + LLAMA_V3_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); + LLAMA_V3_LOG_INFO("%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps); + LLAMA_V3_LOG_INFO("%s: n_ff = %u\n", __func__, n_ff); + LLAMA_V3_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); + LLAMA_V3_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); + LLAMA_V3_LOG_INFO("%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_v3_ftype_name(hparams.ftype)); + LLAMA_V3_LOG_INFO("%s: model size = %s\n", __func__, llama_v3_model_type_name(model.type)); + } + + if (file_version < LLAMA_V3_FILE_VERSION_GGJT_V2) { + if (hparams.ftype != LLAMA_V3_FTYPE_ALL_F32 && + hparams.ftype != LLAMA_V3_FTYPE_MOSTLY_F16 && + hparams.ftype != LLAMA_V3_FTYPE_MOSTLY_Q8_0) { + printf("\nthis format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"); + } + } + + if (file_version < LLAMA_V3_FILE_VERSION_GGJT_V3) { + if (hparams.ftype == LLAMA_V3_FTYPE_MOSTLY_Q4_0 || + hparams.ftype == LLAMA_V3_FTYPE_MOSTLY_Q4_1 || + hparams.ftype == LLAMA_V3_FTYPE_MOSTLY_Q8_0) { + printf("\nthis format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"); + } + } + + if (vocab_only) { + return; + } + + auto & ctx = model.ctx; + + size_t ctx_size; + size_t mmapped_size; + ml->calc_sizes(&ctx_size, &mmapped_size); + LLAMA_V3_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); + + // create the ggml context + { + model.buf.resize(ctx_size); + if (use_mlock) { + model.mlock_buf.init (model.buf.addr); + model.mlock_buf.grow_to(model.buf.size); + } + + struct ggml_init_params params = { + /*.mem_size =*/ model.buf.size, + /*.mem_buffer =*/ model.buf.addr, + /*.no_alloc =*/ ml->use_mmap, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + throw std::runtime_error(format_old("ggml_init() failed")); + } + } + + (void) main_gpu; + (void) mul_mat_q; +#if defined(GGML_USE_CUBLAS) + LLAMA_V3_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__); + ggml_cuda_set_main_device(main_gpu); + ggml_cuda_set_mul_mat_q(mul_mat_q); +#define LLAMA_V3_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define LLAMA_V3_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT +#elif defined(GGML_USE_CLBLAST) + LLAMA_V3_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); +#define LLAMA_V3_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define LLAMA_V3_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU +#else +#define LLAMA_V3_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define LLAMA_V3_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU +#endif + + // prepare memory for the weights + size_t vram_weights = 0; + size_t vram_scratch = 0; + { + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_embd_gqa = hparams.n_embd_gqa(); + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + + ml->ggml_ctx = ctx; + + model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU); + + // "output" tensor + { + ggml_backend backend_norm; + ggml_backend backend_output; + if (n_gpu_layers > int(n_layer)) { // NOLINT + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; +#else + backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? 
GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_V3_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm); + model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output); + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.norm); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; // NOLINT + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + std::string layers_i = "layers." + std::to_string(i); + + layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend); + + layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split); + layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split); + + layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); + + layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split); + layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split); + layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + } + } + } + + ml->done_getting_tensors(); + + // print memory requirements + { + const size_t scale = memory_type == GGML_TYPE_F32 ? 
2 : 1; + + // this is the total memory required to run the inference + size_t mem_required = + ctx_size + + mmapped_size - vram_weights; // weights in VRAM not in memory + +#ifndef LLAMA_V3_USE_ALLOCATOR + mem_required += + blasbatchmul*MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(model.type) + + blasbatchmul*MEM_REQ_SCRATCH1_3().at(model.type) + + blasbatchmul*MEM_REQ_EVAL_3().at(model.type); +#endif + + // this is the memory required by one llama_v3_state + const size_t mem_required_state = + scale*hparams.kv_size(); + + LLAMA_V3_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, + mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); + + (void) vram_scratch; + (void) n_batch; +#ifdef GGML_USE_CUBLAS + if (low_vram) { + LLAMA_V3_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); + ggml_cuda_set_scratch_size(0); // disable scratch + } else { + const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE_3().at(model.type); + const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT_3().at(model.type); + vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context); + ggml_cuda_set_scratch_size(vram_scratch); + if (n_gpu_layers > 0) { + LLAMA_V3_LOG_INFO("%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n", + __func__, vram_scratch_base / kB3, vram_scratch_per_context, + (vram_scratch + MB3 - 1) / MB3); // round up + } + } +#endif // GGML_USE_CUBLAS + +#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + + LLAMA_V3_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); + if (n_gpu_layers > (int) hparams.n_layer) { + LLAMA_V3_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); + } + size_t vram_kv_cache = 0; + +#ifdef GGML_USE_CUBLAS + const int max_backend_supported_layers = hparams.n_layer + 3; + const int max_offloadable_layers = low_vram ? 
hparams.n_layer + 1 : hparams.n_layer + 3; + if (n_gpu_layers > (int) hparams.n_layer + 1) { + if (low_vram) { + LLAMA_V3_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); + } else { + LLAMA_V3_LOG_INFO("%s: offloading v cache to GPU\n", __func__); + vram_kv_cache += hparams.kv_size() / 2; + } + } + if (n_gpu_layers > (int) hparams.n_layer + 2) { + if (low_vram) { + LLAMA_V3_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__); + } else { + LLAMA_V3_LOG_INFO("%s: offloading k cache to GPU\n", __func__); + vram_kv_cache += hparams.kv_size() / 2; + } + } +#elif defined(GGML_USE_CLBLAST) + const int max_backend_supported_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; +#endif // GGML_USE_CUBLAS + + LLAMA_V3_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", + __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); + LLAMA_V3_LOG_INFO("%s: total VRAM used: %zu MB\n", + __func__, (vram_weights + vram_scratch + vram_kv_cache + MB3 - 1) / MB3); // round up +#else + (void) n_gpu_layers; +#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) + } + + // populate `tensors_by_name` + for (llama_v3_load_tensor & lt : ml->tensors_map.tensors) { + model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); + } + + (void) tensor_split; +#if defined(GGML_USE_CUBLAS) + { + ggml_cuda_set_tensor_split(tensor_split); + } +#endif + + ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); + + if (progress_callback) { + progress_callback(1.0f, progress_callback_user_data); + } + + model.mapping = std::move(ml->mapping); + + // loading time will be recalculate after the first eval, so + // we take page faults deferred by mmap() into consideration + model.t_load_us = ggml_time_us() - model.t_start_us; +} + +static bool llama_v3_model_load( + const std::string & fname, + llama_v3_model & model, + llama_v3_vocab & vocab, + int n_ctx, + int n_batch, + int n_gqa, + float rms_norm_eps, + int n_gpu_layers, + int main_gpu, + const float * tensor_split, + const bool mul_mat_q, + float rope_freq_base, + float rope_freq_scale, + bool low_vram, + ggml_type memory_type, + bool use_mmap, + bool use_mlock, + bool vocab_only, + llama_v3_progress_callback progress_callback, + void *progress_callback_user_data) { + try { + llama_v3_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, + main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type, + use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); + return true; + } catch (const std::exception & err) { + LLAMA_V3_LOG_ERROR("error loading model: %s\n", err.what()); + return false; + } +} + +static struct ggml_cgraph * llama_v3_build_graph( + llama_v3_context & lctx, + const llama_v3_token * tokens, + const float * embd, + int n_tokens, + int n_past) { + + LLAMA_V3_ASSERT((!tokens && embd) || (tokens && !embd)); + + const int N = n_tokens; + + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = lctx.kv_self; + + LLAMA_V3_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = hparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = 
hparams.n_embd_gqa(); + + LLAMA_V3_ASSERT(n_embd_head == hparams.n_rot); + + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + const float rms_norm_eps = hparams.f_rms_norm_eps; + + const int n_gpu_layers = model.n_gpu_layers; + + auto & mem_per_token = lctx.mem_per_token; + auto & buf_compute = lctx.buf_compute; + + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.addr, + /*.no_alloc =*/ false, + }; + +#ifdef LLAMA_V3_USE_ALLOCATOR + params.no_alloc = true; +#endif + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + if (tokens) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + +#ifdef LLAMA_V3_USE_ALLOCATOR + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + } +#else + memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); +#endif + ggml_set_name(inp_tokens, "inp_tokens"); + + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + +#ifdef LLAMA_V3_USE_ALLOCATOR + ggml_allocr_alloc(lctx.alloc, inpL); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + } +#else + memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); +#endif + } + + const int i_gpu_start = n_layer - n_gpu_layers; + (void) i_gpu_start; + + // offload functions set the tensor output backend to GPU + // tensors are GPU-accelerated if any input or the output has been offloaded + // + // with the low VRAM option VRAM scratch is disabled in llama_v3_load_model_internal + // in that case ggml_cuda_assign_buffers has no effect + offload_func_t offload_func_nr = llama_v3_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_v3_nop; + offload_func_t offload_func_v = llama_v3_nop; + +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers; + } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers; + } +#endif // GGML_USE_CUBLAS + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); +#ifdef LLAMA_V3_USE_ALLOCATOR + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } +#else + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); +#endif + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + + for (int il = 0; il < n_layer; ++il) { + ggml_format_name(inpL, "layer_inp_%d", il); + + offload_func_t offload_func = llama_v3_nop; + +#ifdef GGML_USE_CUBLAS + if (il >= i_gpu_start) { + offload_func = ggml_cuda_assign_buffers; + } +#endif // GGML_USE_CUBLAS + + struct ggml_tensor * inpSA = inpL; + + lctx.use_buf(ctx0, 0); + + // norm + { + cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + offload_func(cur); + ggml_set_name(cur, "rms_norm_0"); + + // cur = cur*attention_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); + offload_func(cur); + ggml_set_name(cur, "attention_norm_0"); + } + + // self-attention + { + // compute Q and K and RoPE them + 
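+            // With grouped-query attention, the K/V projections are narrower than Q:
+            // wq maps n_embd -> n_embd (n_head heads of size n_embd_head), while wk/wv
+            // map n_embd -> n_embd_gqa (n_head_kv heads). As illustrative numbers only,
+            // assuming 70B shapes (n_head = 64, n_gqa = 8): n_embd = 8192 and
+            // n_embd_gqa = 1024, so each K/V head serves 8 query heads.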
struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + offload_func_kq(tmpk); + ggml_set_name(tmpk, "tmpk"); + + struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + offload_func_kq(tmpq); + ggml_set_name(tmpq, "tmpq"); + + struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + offload_func_kq(Kcur); + ggml_set_name(Kcur, "Kcur"); + + struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + offload_func_kq(Qcur); + ggml_set_name(Qcur, "Qcur"); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + + struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + offload_func_v(tmpv); + ggml_set_name(tmpv, "tmpv"); + + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); + offload_func_v(Vcur); + ggml_set_name(Vcur, "Vcur"); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + offload_func_kq(k); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + offload_func_v(v); + ggml_set_name(v, "v"); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + offload_func_kq(Q); + ggml_set_name(Q, "Q"); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_past + N, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + offload_func_kq(K); + ggml_set_name(K, "K"); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + ggml_set_name(KQ, "KQ"); + + // KQ_scaled = KQ / sqrt(n_embd_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + offload_func_kq(KQ_scaled); + ggml_set_name(KQ_scaled, "KQ_scaled"); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); + ggml_set_name(KQ_masked, "KQ_masked"); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + // split cached V into n_head heads + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_past + N, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + offload_func_v(V); + ggml_set_name(V, "V"); + +#if 1 + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + ggml_set_name(KQV, "KQV"); +#else + // make V contiguous in memory to speed up the matmul, however we waste time on the copy + // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation + // is there a better way? 
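+        // (illustrative note) the cached V here is a strided view, so this disabled
+        // branch trades an extra copy of roughly (n_past + N) * n_embd_head * n_head
+        // floats per layer for fully sequential memory access in the matmul.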
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); +#endif + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + ggml_set_name(KQV_merged, "KQV_merged"); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + offload_func_v(cur); + ggml_set_name(cur, "KQV_merged_contiguous"); + + // projection (no bias) + cur = ggml_mul_mat(ctx0, + model.layers[il].wo, + cur); + offload_func(cur); + ggml_set_name(cur, "result_wo"); + } + + lctx.use_buf(ctx0, 1); + + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + offload_func(inpFF); + ggml_set_name(inpFF, "inpFF"); + + // feed-forward network + { + // norm + { + cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); + offload_func(cur); + ggml_set_name(cur, "rms_norm_1"); + + // cur = cur*ffn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + offload_func(cur); + ggml_set_name(cur, "ffn_norm"); + } + + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model.layers[il].w3, + cur); + offload_func(tmp); + ggml_set_name(tmp, "result_w3"); + + cur = ggml_mul_mat(ctx0, + model.layers[il].w1, + cur); + offload_func(cur); + ggml_set_name(cur, "result_w1"); + + // SILU activation + cur = ggml_silu(ctx0, cur); + offload_func(cur); + ggml_set_name(cur, "silu"); + + cur = ggml_mul(ctx0, cur, tmp); + offload_func(cur); + ggml_set_name(cur, "silu_x_result_w3"); + + cur = ggml_mul_mat(ctx0, + model.layers[il].w2, + cur); + offload_func(cur); + ggml_set_name(cur, "result_w2"); + } + + cur = ggml_add(ctx0, cur, inpFF); + offload_func(cur); + ggml_set_name(cur, "inpFF_+_result_w2"); + + // input for next layer + inpL = cur; + } + + lctx.use_buf(ctx0, 0); + + // norm + { + cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + offload_func_nr(cur); + ggml_set_name(cur, "rms_norm_2"); + + // cur = cur*norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.norm); + // offload_func_nr(cur); // TODO CPU + GPU mirrored backend + ggml_set_name(cur, "result_norm"); + } + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + ggml_set_name(cur, "result_output"); + + lctx.use_buf(ctx0, -1); + + // logits -> probs + //cur = ggml_soft_max_inplace(ctx0, cur); + + ggml_build_forward_expand(gf, cur); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + +#if 0 + LLAMA_V3_LOG_INFO("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__, + ggml_used_mem(ctx0)/1024.0/1024.0, + lctx.get_buf_max_mem(0)/1024.0/1024.0, + lctx.get_buf_max_mem(1)/1024.0/1024.0, + lctx.work_buffer.size()/1024.0/1024.0, + n_past, N); +#endif + + ggml_free(ctx0); + + return gf; +} + +// evaluate the transformer +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - embd embeddings input +// - n_tokens number of tokens +// - n_past: the context size so far +// - n_threads: number of threads to use +// +static bool llama_v3_eval_internal( + llama_v3_context & lctx, + const llama_v3_token * tokens, + const float * embd, + int n_tokens, + int n_past, + int n_threads, + const char * cgraph_fname) { + + LLAMA_V3_ASSERT((!tokens && embd) || (tokens && !embd)); + + LLAMA_V3_ASSERT(n_tokens > 0); + LLAMA_V3_ASSERT(n_past >= 0); + LLAMA_V3_ASSERT(n_threads > 0); + // TODO: 
keep the values of n_batch and n_ctx + // LLAMA_V3_ASSERT(n_tokens <= n_batch); + // LLAMA_V3_ASSERT(n_past + n_tokens <= n_ctx); + + const int64_t t_start_us = ggml_time_us(); + +#ifdef GGML_USE_MPI + ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); +#endif + + const int N = n_tokens; + + const auto & model = lctx.model; + const auto & hparams = model.hparams; + + const auto & kv_self = lctx.kv_self; + + LLAMA_V3_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_vocab = hparams.n_vocab; + +#ifdef LLAMA_V3_USE_ALLOCATOR + ggml_allocr_reset(lctx.alloc); +#endif + + ggml_cgraph * gf = llama_v3_build_graph(lctx, tokens, embd, n_tokens, n_past); + +#ifdef LLAMA_V3_USE_ALLOCATOR + ggml_allocr_alloc_graph(lctx.alloc, gf); +#endif + + // LLAMA_V3_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + // for big prompts, if BLAS is enabled, it is better to use only one thread + // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance + n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; + + LLAMA_V3_ASSERT(strcmp(res->name, "result_output") == 0); + LLAMA_V3_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + +#if GGML_USE_MPI + const int64_t n_layer = hparams.n_layer; + ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); +#endif + +#ifdef GGML_USE_METAL + if (lctx.ctx_metal) { + ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); + ggml_metal_graph_compute(lctx.ctx_metal, gf); + ggml_metal_get_tensor (lctx.ctx_metal, res); + if (!lctx.embedding.empty()) { + ggml_metal_get_tensor(lctx.ctx_metal, embeddings); + } + } else { + llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads); + } +#else + llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads); +#endif + +#if GGML_USE_MPI + ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); +#endif + + // update kv token count + lctx.kv_self.n = n_past + N; + + if (cgraph_fname) { + ggml_graph_export(gf, cgraph_fname); + } + +#ifdef GGML_PERF + // print timing information per ggml operation (for debugging purposes) + // requires GGML_PERF to be defined + ggml_graph_print(gf); +#endif + + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} + + // extract logits + { + auto & logits_out = lctx.logits; + + if (lctx.logits_all) { + logits_out.resize(n_vocab * N); + memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); + } else { + // return result for just the last token + logits_out.resize(n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + } + } + + // extract embeddings + if (!lctx.embedding.empty()) { + auto & embedding_out = lctx.embedding; + + embedding_out.resize(n_embd); + memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); + } + + // measure the performance only for the single-token evals + if (N == 1) { + lctx.t_eval_us += ggml_time_us() - t_start_us; + lctx.n_eval++; + } + else if (N > 1) { + lctx.t_p_eval_us += ggml_time_us() - t_start_us; + lctx.n_p_eval += N; + } + + return true; +} + +// +// tokenizer +// + +static size_t utf8_len3(char src) { + const size_t lookup[] = { 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+    return lookup[highbits];
+}
+
+struct llama_v3_sp_symbol {
+    using index = int;
+    index prev;
+    index next;
+    const char * text;
+    size_t n;
+};
+
+static_assert(std::is_trivially_copyable<llama_v3_sp_symbol>::value, "llama_v3_sp_symbol is not trivially copyable");
+
+struct llama_v3_sp_bigram {
+    struct comparator {
+        bool operator()(llama_v3_sp_bigram & l, llama_v3_sp_bigram & r) {
+            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
+        }
+    };
+    using queue_storage = std::vector<llama_v3_sp_bigram>;
+    using queue = std::priority_queue<llama_v3_sp_bigram, queue_storage, comparator>;
+    llama_v3_sp_symbol::index left;
+    llama_v3_sp_symbol::index right;
+    float score;
+    size_t size;
+};
+
+// original implementation:
+// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
+struct llama_v3_tokenizer {
+    llama_v3_tokenizer(const llama_v3_vocab & vocab): vocab_(vocab) {}
+
+    void tokenize(const std::string & text, std::vector<llama_v3_vocab::id> & output) {
+        // split string into utf8 chars
+        int index = 0;
+        size_t offs = 0;
+        while (offs < text.size()) {
+            llama_v3_sp_symbol sym;
+            size_t char_len = std::min(text.size() - offs, utf8_len3(text[offs]));
+            sym.text = text.c_str() + offs;
+            sym.n = char_len;
+            offs += char_len;
+            sym.prev = index - 1;
+            sym.next = offs == text.size() ? -1 : index + 1;
+            index++;
+            symbols_.emplace_back(sym);
+        }
+
+        // seed the work queue with all possible 2-character tokens.
+        for (size_t i = 1; i < symbols_.size(); ++i) {
+            try_add_bigram(i - 1, i);
+        }
+
+        // keep substituting the highest frequency pairs for as long as we can.
+        while (!work_queue_.empty()) {
+            auto bigram = work_queue_.top();
+            work_queue_.pop();
+
+            auto & left_sym = symbols_[bigram.left];
+            auto & right_sym = symbols_[bigram.right];
+
+            // if one of the symbols already got merged, skip it.
+            if (left_sym.n == 0 || right_sym.n == 0 ||
+                left_sym.n + right_sym.n != bigram.size) {
+                continue;
+            }
+
+            // merge the right sym into the left one
+            left_sym.n += right_sym.n;
+            right_sym.n = 0;
+
+            //LLAMA_V3_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
+
+            // remove the right sym from the chain
+            left_sym.next = right_sym.next;
+            if (right_sym.next >= 0) {
+                symbols_[right_sym.next].prev = bigram.left;
+            }
+
+            // find more substitutions
+            try_add_bigram(left_sym.prev, bigram.left);
+            try_add_bigram(bigram.left, left_sym.next);
+        }
+
+        for (int i = 0; i != -1; i = symbols_[i].next) {
+            auto & symbol = symbols_[i];
+            auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));
+
+            if (token == vocab_.token_to_id.end()) {
+                // output any symbols that did not form tokens as bytes.
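+                // (sketch, vocab-dependent) single bytes have their own one-character
+                // entries in a SentencePiece-style vocab, so an unmatched multi-byte
+                // character such as "é" (0xC3 0xA9) falls back to the two tokens for
+                // "\xC3" and "\xA9" in the loop below.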
+                for (int j = 0; j < (int) symbol.n; ++j) {
+                    // NOTE: old version, before #2420 - not sure what the implications of this are
+                    //llama_v3_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    llama_v3_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
+                    output.push_back(token_id);
+                }
+            } else {
+                output.push_back((*token).second);
+            }
+        }
+    }
+
+private:
+    void try_add_bigram(int left, int right) {
+        if (left == -1 || right == -1) {
+            return;
+        }
+
+        const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
+        auto token = vocab_.token_to_id.find(text);
+
+        if (token == vocab_.token_to_id.end()) {
+            return;
+        }
+
+        if (static_cast<size_t>((*token).second) >= vocab_.id_to_token.size()) {
+            return;
+        }
+
+        const auto & tok_score = vocab_.id_to_token[(*token).second];
+
+        llama_v3_sp_bigram bigram;
+        bigram.left = left;
+        bigram.right = right;
+        bigram.score = tok_score.score;
+        bigram.size = text.size();
+        work_queue_.push(bigram);
+    }
+
+    const llama_v3_vocab & vocab_;
+    std::vector<llama_v3_sp_symbol> symbols_;
+    llama_v3_sp_bigram::queue work_queue_;
+};
+
+std::vector<llama_v3_token> llama_v3_tokenize(
+        struct llama_v3_context * ctx,
+        const std::string & text,
+        bool add_bos) {
+    // upper limit for the number of tokens
+    int n_tokens = text.length() + add_bos;
+    std::vector<llama_v3_token> result(n_tokens);
+    n_tokens = llama_v3_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_v3_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
+
+static std::vector<llama_v3_vocab::id> llama_v3_tokenize(const llama_v3_vocab & vocab, const std::string & text, bool bos) {
+    llama_v3_tokenizer tokenizer(vocab);
+    std::vector<llama_v3_vocab::id> output;
+
+    if (text.empty()) {
+        return output;
+    }
+
+    if (bos) {
+        output.push_back(llama_v3_token_bos());
+    }
+
+    tokenizer.tokenize(text, output);
+    return output;
+}
+
+//
+// grammar - internal
+//
+
+struct llama_v3_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_v3_grammar {
+    const std::vector<std::vector<llama_v3_grammar_element>>   rules;
+    std::vector<std::vector<const llama_v3_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_v3_partial_utf8 partial_utf8;
+};
+
+struct llama_v3_grammar_candidate {
+    size_t                index;
+    const uint32_t      * code_points;
+    llama_v3_partial_utf8 partial_utf8;
+};
+
+// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
+// pointer. If an invalid sequence is encountered, returns `llama_v3_partial_utf8.n_remain == -1`.
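+// Illustrative walk-through of the decoder below: the high nibble of a lead byte
+// selects the sequence length via the lookup table,
+//   0x0-0x7 (0xxxxxxx) -> 1 byte, 0x8-0xB (10xxxxxx) -> 0, i.e. invalid lead byte,
+//   0xC-0xD (110xxxxx) -> 2 bytes, 0xE (1110xxxx) -> 3 bytes, 0xF (11110xxx) -> 4.
+// For example, decode_utf8("\xE2\x82", {0, 0}) consumes the first two bytes of
+// U+20AC ('€'), returns no complete code point yet, and carries over
+// partial_utf8 = { 0x82, 1 }; a later call with "\xAC" and that carry-over
+// completes the code point 0x20AC.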
+std::pair<std::vector<uint32_t>, llama_v3_partial_utf8> decode_utf8(
+        const char * src,
+        llama_v3_partial_utf8 partial_start) {
+    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
+    const char * pos = src;
+    std::vector<uint32_t> code_points;
+    uint32_t value = partial_start.value;
+    int n_remain = partial_start.n_remain;
+
+    // continue previous decode, if applicable
+    while (*pos != 0 && n_remain > 0) {
+        uint8_t next_byte = static_cast<uint8_t>(*pos);
+        if ((next_byte >> 6) != 2) {
+            // invalid sequence, abort
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_v3_partial_utf8{ 0, -1 });
+        }
+        value = (value << 6) + (next_byte & 0x3F);
+        ++pos;
+        --n_remain;
+    }
+
+    if (partial_start.n_remain > 0 && n_remain == 0) {
+        code_points.push_back(value);
+    }
+
+    // decode any subsequent utf-8 sequences, which may end in an incomplete one
+    while (*pos != 0) {
+        uint8_t first_byte = static_cast<uint8_t>(*pos);
+        uint8_t highbits = first_byte >> 4;
+        n_remain = lookup[highbits] - 1;
+
+        if (n_remain < 0) {
+            // invalid sequence, abort
+            code_points.clear();
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_v3_partial_utf8{ 0, n_remain });
+        }
+
+        uint8_t mask = (1 << (7 - n_remain)) - 1;
+        value = first_byte & mask;
+        ++pos;
+        while (*pos != 0 && n_remain > 0) {
+            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+            ++pos;
+            --n_remain;
+        }
+        if (n_remain == 0) {
+            code_points.push_back(value);
+        }
+    }
+    code_points.push_back(0);
+
+    return std::make_pair(std::move(code_points), llama_v3_partial_utf8{ value, n_remain });
+}
+
+// returns true iff pos points to the end of one of the definitions of a rule
+static bool llama_v3_grammar_is_end_of_sequence(const llama_v3_grammar_element * pos) {
+    switch (pos->type) {
+        case LLAMA_V3_GRETYPE_END: return true;
+        case LLAMA_V3_GRETYPE_ALT: return true;
+        default:                   return false;
+    }
+}
+
+// returns true iff chr satisfies the char range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static std::pair<bool, const llama_v3_grammar_element *> llama_v3_grammar_match_char(
+        const llama_v3_grammar_element * pos,
+        const uint32_t                   chr) {
+
+    bool found            = false;
+    bool is_positive_char = pos->type == LLAMA_V3_GRETYPE_CHAR;
+    LLAMA_V3_ASSERT(is_positive_char || pos->type == LLAMA_V3_GRETYPE_CHAR_NOT);
+
+    do {
+        if (pos[1].type == LLAMA_V3_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            found = found || (pos->value <= chr && chr <= pos[1].value);
+            pos += 2;
+        } else {
+            // exact char match, e.g.
[a] or "a" + found = found || pos->value == chr; + pos += 1; + } + } while (pos->type == LLAMA_V3_GRETYPE_CHAR_ALT); + + return std::make_pair(found == is_positive_char, pos); +} + +// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char +// range at pos (regular or inverse range) +// asserts that pos is pointing to a char range element +static bool llama_v3_grammar_match_partial_char( + const llama_v3_grammar_element * pos, + const llama_v3_partial_utf8 partial_utf8) { + + bool is_positive_char = pos->type == LLAMA_V3_GRETYPE_CHAR; + LLAMA_V3_ASSERT(is_positive_char || pos->type == LLAMA_V3_GRETYPE_CHAR_NOT); + + uint32_t partial_value = partial_utf8.value; + int n_remain = partial_utf8.n_remain; + + // invalid sequence or 7-bit char split across 2 bytes (overlong) + if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) { + return false; + } + + // range of possible code points this partial UTF-8 sequence could complete to + uint32_t low = partial_value << (n_remain * 6); + uint32_t high = low | ((1 << (n_remain * 6)) - 1); + + if (low == 0) { + if (n_remain == 2) { + low = 1 << 11; + } else if (n_remain == 3) { + low = 1 << 16; + } + } + + do { + if (pos[1].type == LLAMA_V3_GRETYPE_CHAR_RNG_UPPER) { + // inclusive range, e.g. [a-z] + if (pos->value <= high && low <= pos[1].value) { + return is_positive_char; + } + pos += 2; + } else { + // exact char match, e.g. [a] or "a" + if (low <= pos->value && pos->value <= high) { + return is_positive_char; + } + pos += 1; + } + } while (pos->type == LLAMA_V3_GRETYPE_CHAR_ALT); + + return !is_positive_char; +} + + +// transforms a grammar pushdown stack into N possible stacks, all ending +// at a character range (terminal element) +static void llama_v3_grammar_advance_stack( + const std::vector> & rules, + const std::vector & stack, + std::vector> & new_stacks) { + + if (stack.empty()) { + new_stacks.push_back(stack); + return; + } + + const llama_v3_grammar_element * pos = stack.back(); + + switch (pos->type) { + case LLAMA_V3_GRETYPE_RULE_REF: { + const size_t rule_id = static_cast(pos->value); + const llama_v3_grammar_element * subpos = rules[rule_id].data(); + do { + // init new stack without the top (pos) + std::vector new_stack(stack.begin(), stack.end() - 1); + if (!llama_v3_grammar_is_end_of_sequence(pos + 1)) { + // if this rule ref is followed by another element, add that to stack + new_stack.push_back(pos + 1); + } + if (!llama_v3_grammar_is_end_of_sequence(subpos)) { + // if alternate is nonempty, add to stack + new_stack.push_back(subpos); + } + llama_v3_grammar_advance_stack(rules, new_stack, new_stacks); + while (!llama_v3_grammar_is_end_of_sequence(subpos)) { + // scan to end of alternate def + subpos++; + } + if (subpos->type == LLAMA_V3_GRETYPE_ALT) { + // there's another alternate def of this rule to process + subpos++; + } else { + break; + } + } while (true); + break; + } + case LLAMA_V3_GRETYPE_CHAR: + case LLAMA_V3_GRETYPE_CHAR_NOT: + new_stacks.push_back(stack); + break; + default: + // end of alternate (LLAMA_V3_GRETYPE_END, LLAMA_V3_GRETYPE_ALT) or middle of char range + // (LLAMA_V3_GRETYPE_CHAR_ALT, LLAMA_V3_GRETYPE_CHAR_RNG_UPPER); stack should never be left on + // those + LLAMA_V3_ASSERT(false); + } +} + +// takes a set of possible pushdown stacks on a grammar, which are required to +// be positioned at a character range (see `llama_v3_grammar_advance_stack`), and +// produces the N possible stacks if the given char is accepted at those +// positions +static std::vector> 
llama_v3_grammar_accept( + const std::vector> & rules, + const std::vector> & stacks, + const uint32_t chr) { + + std::vector> new_stacks; + + for (const auto & stack : stacks) { + if (stack.empty()) { + continue; + } + + auto match = llama_v3_grammar_match_char(stack.back(), chr); + if (match.first) { + const llama_v3_grammar_element * pos = match.second; + + // update top of stack to next element, if any + std::vector new_stack(stack.begin(), stack.end() - 1); + if (!llama_v3_grammar_is_end_of_sequence(pos)) { + new_stack.push_back(pos); + } + llama_v3_grammar_advance_stack(rules, new_stack, new_stacks); + } + } + + return new_stacks; +} + +static std::vector llama_v3_grammar_reject_candidates( + const std::vector> & rules, + const std::vector> & stacks, + const std::vector & candidates); + +static std::vector llama_v3_grammar_reject_candidates_for_stack( + const std::vector> & rules, + const std::vector & stack, + const std::vector & candidates) { + + std::vector rejects; + + if (stack.empty()) { + for (auto tok : candidates) { + if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) { + rejects.push_back(tok); + } + } + return rejects; + } + + const llama_v3_grammar_element * stack_pos = stack.back(); + + std::vector next_candidates; + for (auto tok : candidates) { + if (*tok.code_points == 0) { + // reached end of full codepoints in token, reject iff it ended in a partial sequence + // that cannot satisfy this position in grammar + if (tok.partial_utf8.n_remain != 0 && + !llama_v3_grammar_match_partial_char(stack_pos, tok.partial_utf8)) { + rejects.push_back(tok); + } + } else if (llama_v3_grammar_match_char(stack_pos, *tok.code_points).first) { + next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 }); + } else { + rejects.push_back(tok); + } + } + + auto stack_pos_after = llama_v3_grammar_match_char(stack_pos, 0).second; + + // update top of stack to next element, if any + std::vector stack_after(stack.begin(), stack.end() - 1); + if (!llama_v3_grammar_is_end_of_sequence(stack_pos_after)) { + stack_after.push_back(stack_pos_after); + } + std::vector> next_stacks; + llama_v3_grammar_advance_stack(rules, stack_after, next_stacks); + + auto next_rejects = llama_v3_grammar_reject_candidates(rules, next_stacks, next_candidates); + for (auto tok : next_rejects) { + rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 }); + } + + return rejects; +} + +static std::vector llama_v3_grammar_reject_candidates( + const std::vector> & rules, + const std::vector> & stacks, + const std::vector & candidates) { + LLAMA_V3_ASSERT(!stacks.empty()); // REVIEW + + if (candidates.empty()) { + return std::vector(); + } + + auto rejects = llama_v3_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates); + + for (size_t i = 1, size = stacks.size(); i < size; ++i) { + rejects = llama_v3_grammar_reject_candidates_for_stack(rules, stacks[i], rejects); + } + return rejects; +} + +// +// grammar - external +// + +struct llama_v3_grammar * llama_v3_grammar_init( + const llama_v3_grammar_element ** rules, + size_t n_rules, + size_t start_rule_index) { + const llama_v3_grammar_element * pos; + + // copy rule definitions into vectors + std::vector> vec_rules(n_rules); + for (size_t i = 0; i < n_rules; i++) { + for (pos = rules[i]; pos->type != LLAMA_V3_GRETYPE_END; pos++) { + vec_rules[i].push_back(*pos); + } + vec_rules[i].push_back({LLAMA_V3_GRETYPE_END, 0}); + } + + // loop over alternates of start rule to build initial stacks + std::vector> stacks; + 
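+    // Illustrative example with a hypothetical grammar root ::= "a" | "bc",
+    // serialized as CHAR('a') ALT CHAR('b') CHAR('c') END: the loop below visits
+    // each alternate in turn and seeds one initial stack per alternate,
+    // [CHAR('a')] and [CHAR('b')]; llama_v3_grammar_advance_stack then expands
+    // any leading rule references so that every stack top is a terminal.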
pos = rules[start_rule_index]; + do { + std::vector stack; + if (!llama_v3_grammar_is_end_of_sequence(pos)) { + // if alternate is nonempty, add to stack + stack.push_back(pos); + } + llama_v3_grammar_advance_stack(vec_rules, stack, stacks); + while (!llama_v3_grammar_is_end_of_sequence(pos)) { + // scan to end of alternate def + pos++; + } + if (pos->type == LLAMA_V3_GRETYPE_ALT) { + // there's another alternate def of this rule to process + pos++; + } else { + break; + } + } while (true); + + return new llama_v3_grammar{ std::move(vec_rules), std::move(stacks), {} }; +} + +void llama_v3_grammar_free(struct llama_v3_grammar * grammar) { + delete grammar; +} + +// +// sampling +// + +void llama_v3_sample_softmax(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates) { + assert(candidates->size > 0); + + const int64_t t_start_sample_us = ggml_time_us(); + + // Sort the logits in descending order + if (!candidates->sorted) { + std::sort(candidates->data, candidates->data + candidates->size, [](const llama_v3_token_data & a, const llama_v3_token_data & b) { + return a.logit > b.logit; + }); + candidates->sorted = true; + } + + float max_l = candidates->data[0].logit; + float cum_sum = 0.0f; + for (size_t i = 0; i < candidates->size; ++i) { + float p = expf(candidates->data[i].logit - max_l); + candidates->data[i].p = p; + cum_sum += p; + } + for (size_t i = 0; i < candidates->size; ++i) { + candidates->data[i].p /= cum_sum; + } + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +void llama_v3_sample_top_k(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, int k, size_t min_keep) { + const int64_t t_start_sample_us = ggml_time_us(); + + k = std::max(k, (int) min_keep); + k = std::min(k, (int) candidates->size); + + // Sort scores in descending order + if (!candidates->sorted) { + auto comp = [](const llama_v3_token_data & a, const llama_v3_token_data & b) { + return a.logit > b.logit; + }; + if (k == (int) candidates->size) { + std::sort(candidates->data, candidates->data + candidates->size, comp); + } else { + std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); + } + candidates->sorted = true; + } + candidates->size = k; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +void llama_v3_sample_top_p(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, float p, size_t min_keep) { + if (p >= 1.0f) { + return; + } + + llama_v3_sample_softmax(ctx, candidates); + + const int64_t t_start_sample_us = ggml_time_us(); + + // Compute the cumulative probabilities + float cum_sum = 0.0f; + size_t last_idx = candidates->size; + + for (size_t i = 0; i < candidates->size; ++i) { + cum_sum += candidates->data[i].p; + + // Check if the running sum is at least p or if we have kept at least min_keep tokens + // we set the last index to i+1 to indicate that the current iterate should be included in the set + if (cum_sum >= p && i + 1 >= min_keep) { + last_idx = i + 1; + break; + } + } + + // Resize the output vector to keep only the top-p tokens + candidates->size = last_idx; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +void llama_v3_sample_tail_free(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, float z, size_t min_keep) { + if (z >= 1.0f || candidates->size <= 2) { + return; + } + + llama_v3_sample_softmax(nullptr, candidates); + const int64_t t_start_sample_us = ggml_time_us(); + + // 
Compute the first and second derivatives
+    std::vector<float> first_derivatives(candidates->size - 1);
+    std::vector<float> second_derivatives(candidates->size - 2);
+
+    for (size_t i = 0; i < first_derivatives.size(); ++i) {
+        first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
+    }
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
+    }
+
+    // Calculate absolute value of second derivatives
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = fabsf(second_derivatives[i]);
+    }
+
+    // Normalize the second derivatives
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
+    }
+
+    float cum_sum = 0.0f;
+    size_t last_idx = candidates->size;
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        cum_sum += second_derivatives[i];
+
+        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
+        if (cum_sum > z && i >= min_keep) {
+            last_idx = i;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the tokens above the tail location
+    candidates->size = last_idx;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+
+void llama_v3_sample_typical(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, float p, size_t min_keep) {
+    // Reference implementation:
+    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+    if (p >= 1.0f) {
+        return;
+    }
+
+    // Compute the softmax of logits and calculate entropy
+    llama_v3_sample_softmax(nullptr, candidates);
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    float entropy = 0.0f;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        if (candidates->data[i].p > 0) {
+            entropy += -candidates->data[i].p * logf(candidates->data[i].p);
+        }
+    }
+
+    // Compute the absolute difference between negative log probability and entropy for each candidate
+    std::vector<float> shifted_scores;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
+        shifted_scores.push_back(shifted_score);
+    }
+
+    // Sort tokens based on the shifted_scores and their corresponding indices
+    std::vector<size_t> indices(candidates->size);
+    std::iota(indices.begin(), indices.end(), 0);
+
+    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
+        return shifted_scores[a] < shifted_scores[b];
+    });
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = indices.size();
+
+    for (size_t i = 0; i < indices.size(); ++i) {
+        size_t idx = indices[i];
+        cum_sum += candidates->data[idx].p;
+
+        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
+        if (cum_sum > p && i >= min_keep - 1) {
+            last_idx = i + 1;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the locally typical tokens
+    std::vector<llama_v3_token_data> new_candidates;
+    for (size_t i = 0; i < last_idx; ++i) {
+        size_t idx = indices[i];
+        new_candidates.push_back(candidates->data[idx]);
+    }
+
+    // Replace the data in candidates with the new_candidates data
+    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
+    candidates->size = new_candidates.size();
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_v3_sample_temperature(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates_p, float temp) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        candidates_p->data[i].logit /= temp;
+    }
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_v3_sample_repetition_penalty(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, const llama_v3_token * last_tokens, size_t last_tokens_size, float penalty) {
+    if (last_tokens_size == 0 || penalty == 1.0f) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        if (token_iter == last_tokens + last_tokens_size) {
+            continue;
+        }
+
+        // The academic publication that described this technique only ever divided by the penalty,
+        // but that would make tokens with negative logits more likely, which is obviously wrong.
+        // The common fix, applied here, is to multiply by the penalty instead of dividing when the logit is negative.
+        if (candidates->data[i].logit <= 0) {
+            candidates->data[i].logit *= penalty;
+        } else {
+            candidates->data[i].logit /= penalty;
+        }
+    }
+
+    candidates->sorted = false;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_v3_sample_frequency_and_presence_penalties(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, const llama_v3_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
+    if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Create a frequency map to count occurrences of each token in last_tokens
+    std::unordered_map<llama_v3_token, int> token_count;
+    for (size_t i = 0; i < last_tokens_size; ++i) {
+        token_count[last_tokens_p[i]]++;
+    }
+
+    // Apply frequency and presence penalties to the candidates
+    for (size_t i = 0; i < candidates->size; ++i) {
+        auto token_iter = token_count.find(candidates->data[i].id);
+        if (token_iter == token_count.end()) {
+            continue;
+        }
+
+        int count = token_iter->second;
+        candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
+    }
+
+    candidates->sorted = false;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_v3_sample_grammar(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, const struct llama_v3_grammar * grammar) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    bool allow_eos = false;
+    for (const auto & stack : grammar->stacks) {
+        if (stack.empty()) {
+            allow_eos = true;
+            break;
+        }
+    }
+
+    const llama_v3_token eos = llama_v3_token_eos();
+
+    std::vector<std::pair<std::vector<uint32_t>, llama_v3_partial_utf8>> candidates_decoded;
+    std::vector<llama_v3_grammar_candidate>                              candidates_grammar;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        const llama_v3_token id  = candidates->data[i].id;
+        const char *         str = llama_v3_token_to_str(ctx, id);
+        if (id == eos) {
+            if (!allow_eos) {
+                candidates->data[i].logit = -INFINITY;
+            }
+        } else if (*str == 0) {
+            candidates->data[i].logit = -INFINITY;
+        } else {
+            candidates_decoded.push_back(decode_utf8(str, grammar->partial_utf8));
+            candidates_grammar.push_back({
+                i,
candidates_decoded.back().first.data(), candidates_decoded.back().second + }); + } + } + + const auto rejects = + llama_v3_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar); + for (auto & reject : rejects) { + candidates->data[reject.index].logit = -INFINITY; + } + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; +} + +static void llama_v3_log_softmax(float * array, size_t size) { + float max_l = *std::max_element(array, array + size); + float sum = 0.f; + for (size_t i = 0; i < size; ++i) { + float p = expf(array[i] - max_l); + sum += p; + array[i] = p; + } + + for (size_t i = 0; i < size; ++i) { + array[i] = logf(array[i] / sum); + } +} + +void llama_v3_sample_classifier_free_guidance( + struct llama_v3_context * ctx, + llama_v3_token_data_array * candidates, + struct llama_v3_context * guidance_ctx, + float scale) { + int64_t t_start_sample_us = ggml_time_us(); + + assert(ctx); + auto n_vocab = llama_v3_n_vocab(ctx); + assert(n_vocab == (int)candidates->size); + assert(!candidates->sorted); + + std::vector logits_base; + logits_base.reserve(candidates->size); + for (size_t i = 0; i < candidates->size; ++i) { + logits_base.push_back(candidates->data[i].logit); + } + llama_v3_log_softmax(logits_base.data(), candidates->size); + + float* logits_guidance = llama_v3_get_logits(guidance_ctx); + llama_v3_log_softmax(logits_guidance, n_vocab); + + for (int i = 0; i < n_vocab; ++i) { + float logit_guidance = logits_guidance[i]; + float logit_base = logits_base[i]; + candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance; + } + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +llama_v3_token llama_v3_sample_token_mirostat(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, float tau, float eta, int m, float * mu) { + assert(ctx); + auto N = float(llama_v3_n_vocab(ctx)); + int64_t t_start_sample_us; + t_start_sample_us = ggml_time_us(); + + llama_v3_sample_softmax(nullptr, candidates); + + // Estimate s_hat using the most probable m tokens + float s_hat = 0.0; + float sum_ti_bi = 0.0; + float sum_ti_sq = 0.0; + for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) { + float t_i = logf(float(i + 2) / float(i + 1)); + float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p); + sum_ti_bi += t_i * b_i; + sum_ti_sq += t_i * t_i; + } + s_hat = sum_ti_bi / sum_ti_sq; + + // Compute k from the estimated s_hat and target surprise value + float epsilon_hat = s_hat - 1; + float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat); + + // Sample the next word X using top-k sampling + llama_v3_sample_top_k(nullptr, candidates, int(k), 1); + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } + llama_v3_token X = llama_v3_sample_token(ctx, candidates); + t_start_sample_us = ggml_time_us(); + + // Compute error as the difference between observed surprise and target surprise value + size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_v3_token_data & candidate) { + return candidate.id == X; + })); + float observed_surprise = -log2f(candidates->data[X_idx].p); + float e = observed_surprise - tau; + + // Update mu using the learning rate and error + *mu = *mu - eta * e; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } + return X; +} + +llama_v3_token llama_v3_sample_token_mirostat_v2(struct llama_v3_context * ctx, 
llama_v3_token_data_array * candidates, float tau, float eta, float * mu) { + int64_t t_start_sample_us; + t_start_sample_us = ggml_time_us(); + + llama_v3_sample_softmax(ctx, candidates); + + // Truncate the words with surprise values greater than mu + candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_v3_token_data & candidate) { + return -log2f(candidate.p) > *mu; + })); + + if (candidates->size == 0) { + candidates->size = 1; + } + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // Normalize the probabilities of the remaining words + llama_v3_sample_softmax(ctx, candidates); + + // Sample the next word X from the remaining words + llama_v3_token X = llama_v3_sample_token(ctx, candidates); + t_start_sample_us = ggml_time_us(); + + // Compute error as the difference between observed surprise and target surprise value + size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_v3_token_data & candidate) { + return candidate.id == X; + })); + float observed_surprise = -log2f(candidates->data[X_idx].p); + float e = observed_surprise - tau; + + // Update mu using the learning rate and error + *mu = *mu - eta * e; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } + return X; +} + +llama_v3_token llama_v3_sample_token_greedy(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates) { + const int64_t t_start_sample_us = ggml_time_us(); + + // Find max element + auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_v3_token_data & a, const llama_v3_token_data & b) { + return a.logit < b.logit; + }); + + llama_v3_token result = max_iter->id; + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->n_sample++; + } + return result; +} + +llama_v3_token llama_v3_sample_token(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates) { + assert(ctx); + const int64_t t_start_sample_us = ggml_time_us(); + llama_v3_sample_softmax(nullptr, candidates); + + std::vector probs; + probs.reserve(candidates->size); + for (size_t i = 0; i < candidates->size; ++i) { + probs.push_back(candidates->data[i].p); + } + + std::discrete_distribution<> dist(probs.begin(), probs.end()); + auto & rng = ctx->rng; + int idx = dist(rng); + + llama_v3_token result = candidates->data[idx].id; + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->n_sample++; + return result; +} + +void llama_v3_grammar_accept_token(struct llama_v3_context * ctx, struct llama_v3_grammar * grammar, llama_v3_token token) { + const int64_t t_start_sample_us = ggml_time_us(); + + if (token == llama_v3_token_eos()) { + for (const auto & stack : grammar->stacks) { + if (stack.empty()) { + return; + } + } + LLAMA_V3_ASSERT(false); + } + + const char * str = llama_v3_token_to_str(ctx, token); + + // Note terminating 0 in decoded string + const auto decoded = decode_utf8(str, grammar->partial_utf8); + const auto & code_points = decoded.first; + for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { + grammar->stacks = llama_v3_grammar_accept(grammar->rules, grammar->stacks, *it); + } + grammar->partial_utf8 = decoded.second; + LLAMA_V3_ASSERT(!grammar->stacks.empty()); + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; +} + +// +// quantization +// + +static void 
llama_v3_convert_tensor_internal(const llama_v3_load_tensor & tensor, llama_v3_buffer & output, const int nelements, const int nthread) { + if (output.size < nelements * sizeof(float)) { + output.resize(nelements * sizeof(float)); + } + float * f32_output = (float *) output.addr; + + ggml_type_traits_t qtype; + if (ggml_is_quantized(tensor.type)) { + qtype = ggml_internal_get_type_traits(tensor.type); + if (qtype.to_float == NULL) { + throw std::runtime_error(format_old("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type))); + } + } else if (tensor.type != GGML_TYPE_F16) { + throw std::runtime_error(format_old("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type))); + } + + if (nthread < 2) { + if (tensor.type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements); + } else if (ggml_is_quantized(tensor.type)) { + qtype.to_float(tensor.data, f32_output, nelements); + } else { + LLAMA_V3_ASSERT(false); // unreachable + } + return; + } + + auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type); + auto block_size_bytes = ggml_type_size(tensor.type); + + LLAMA_V3_ASSERT(nelements % block_size == 0); + auto nblocks = nelements / block_size; + auto blocks_per_thread = nblocks / nthread; + auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count + + std::vector workers; + for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) { + auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread + auto thr_elems = thr_blocks * block_size; // number of elements for this thread + auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread + + auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { + if (typ == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); + } else { + qtype.to_float(inbuf, outbuf, nels); + } + }; + workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems)); + in_buff_offs += thr_block_bytes; + out_buff_offs += thr_elems; + } + for (auto & worker : workers) { + worker.join(); + } + +} + +static void llama_v3_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_v3_model_quantize_params * params) { + ggml_type quantized_type; + llama_v3_ftype ftype = params->ftype; + int nthread = params->nthread; + + switch (params->ftype) { + case LLAMA_V3_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; + case LLAMA_V3_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; + case LLAMA_V3_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break; + case LLAMA_V3_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break; + case LLAMA_V3_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break; + case LLAMA_V3_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break; + case LLAMA_V3_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break; + +#ifdef GGML_USE_K_QUANTS + // K-quants + case LLAMA_V3_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; + case LLAMA_V3_FTYPE_MOSTLY_Q3_K_S: + case LLAMA_V3_FTYPE_MOSTLY_Q3_K_M: + case LLAMA_V3_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; + case LLAMA_V3_FTYPE_MOSTLY_Q4_K_S: + case LLAMA_V3_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break; + case 
LLAMA_V3_FTYPE_MOSTLY_Q5_K_S:
+        case LLAMA_V3_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
+        case LLAMA_V3_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break;
+#endif
+        default: throw std::runtime_error(format_old("invalid output file type %d\n", ftype));
+    }
+
+    if (nthread <= 0) {
+        nthread = std::thread::hardware_concurrency();
+    }
+
+    std::unique_ptr<llama_v3_model_loader> model_loader(new llama_v3_model_loader(fname_inp, /*use_mmap*/ false));
+    llama_v3_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);
+
+#ifdef GGML_USE_K_QUANTS
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    for (auto & tensor : model_loader->tensors_map.tensors) {
+        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            ++n_attention_wv;
+        }
+        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            ++n_feed_forward_w2;
+        }
+    }
+
+    int i_attention_wv = 0;
+    int i_feed_forward_w2 = 0;
+#endif
+
+    size_t total_size_org = 0;
+    size_t total_size_new = 0;
+    std::vector<int64_t> hist_all(1 << 4, 0);
+
+    std::vector<std::thread> workers;
+    std::mutex mutex;
+
+    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
+    size_t idx = 0;
+    for (llama_v3_load_tensor & tensor : model_loader->tensors_map.tensors) {
+        llama_v3_buffer read_data;
+        read_data.resize(tensor.size);
+        tensor.data = read_data.addr;
+        model_loader->load_data_for(tensor);
+
+        LLAMA_V3_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
+               ++idx, model_loader->tensors_map.tensors.size(),
+               tensor.name.c_str(), llama_v3_format_tensor_shape(tensor.ne).c_str(),
+               ggml_type_name(tensor.type));
+
+        // This used to be a regex, but <regex> has an extreme cost to compile times.
+        bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
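+        // e.g. "layers.0.attention.wq.weight".rfind("weight") == 22 == size() - 6,
+        // so it is considered for quantization, while a hypothetical "weight_scales"
+        // gives rfind == 0 != size() - 6 and is skipped (std::string::npos from a
+        // failed rfind likewise compares unequal).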
+ + // quantize only 2D tensors + quantize &= (tensor.ne.size() == 2); + quantize &= params->quantize_output_tensor || tensor.name != "output.weight"; + quantize &= quantized_type != tensor.type; + + enum ggml_type new_type; + void * new_data; + size_t new_size; + llama_v3_buffer work; + + if (!quantize) { + new_type = tensor.type; + new_data = tensor.data; + new_size = tensor.size; + LLAMA_V3_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0); + } else { + new_type = quantized_type; +#ifdef GGML_USE_K_QUANTS + if (tensor.name == "output.weight") { + int nx = tensor.ne.at(0); + int ny = tensor.ne.at(1); + if (nx % QK_K == 0 && ny % QK_K == 0) { + new_type = GGML_TYPE_Q6_K; + } + } else if (tensor.name.find("attention.wv.weight") != std::string::npos) { + if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + else if ((ftype == LLAMA_V3_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q5_K_M) && + use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; + else if (QK_K == 64 && (ftype == LLAMA_V3_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_S) && + (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; + ++i_attention_wv; + } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) { + if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + else if ((ftype == LLAMA_V3_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q5_K_M) && + use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + //else if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K; + ++i_feed_forward_w2; + } else if (tensor.name.find("attention.wo.weight") != std::string::npos) { + if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_V3_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_V3_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + } + bool convert_incompatible_tensor = false; + if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || + new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { + int nx = tensor.ne.at(0); + int ny = tensor.ne.at(1); + if (nx % QK_K != 0 || ny % QK_K != 0) { + LLAMA_V3_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); + convert_incompatible_tensor = true; + } + } + if (convert_incompatible_tensor) { + if (tensor.name == "output.weight") { + new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. + LLAMA_V3_LOG_WARN("F16 will be used for this tensor instead.\n"); + } else if (tensor.name == "tok_embeddings.weight") { + new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing. 
+ LLAMA_V3_LOG_WARN("Q4_0 will be used for this tensor instead.\n"); + } else { + throw std::runtime_error("Unsupported tensor size encountered\n"); + } + } +#endif + + float * f32_data; + size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); + llama_v3_buffer f32_conv_buf; + + if (tensor.type == GGML_TYPE_F32) { + f32_data = (float *) tensor.data; + } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) { + throw std::runtime_error(format_old("requantizing from type %s is disabled", ggml_type_name(tensor.type))); + } else { + llama_v3_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread); + f32_data = (float *) f32_conv_buf.addr; + } + + LLAMA_V3_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type)); + fflush(stdout); + + work.resize(nelements * 4); // upper bound on size + new_data = work.addr; + std::vector hist_cur(1 << 4, 0); + + int chunk_size = 32 * 512; + const int nchunk = (nelements + chunk_size - 1)/chunk_size; + const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; + if (nthread_use < 2) { + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); + } else { + size_t counter = 0; + new_size = 0; + auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () { + std::vector local_hist; + size_t local_size = 0; + while (true) { + std::unique_lock lock(mutex); + size_t first = counter; counter += chunk_size; + if (first >= nelements) { + if (!local_hist.empty()) { + for (int j=0; j %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0); + int64_t tot_count = 0; + for (size_t i = 0; i < hist_cur.size(); i++) { + hist_all[i] += hist_cur[i]; + tot_count += hist_cur[i]; + } + + if (tot_count > 0) { + for (size_t i = 0; i < hist_cur.size(); i++) { + LLAMA_V3_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements)); + } + } + LLAMA_V3_LOG_INFO("\n"); + } + total_size_org += tensor.size; + total_size_new += new_size; + file_saver.write_tensor(tensor, new_type, new_data, new_size); + } + + LLAMA_V3_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + LLAMA_V3_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); + + { + int64_t sum_all = 0; + for (size_t i = 0; i < hist_all.size(); i++) { + sum_all += hist_all[i]; + } + + if (sum_all > 0) { + LLAMA_V3_LOG_INFO("%s: hist: ", __func__); + for (size_t i = 0; i < hist_all.size(); i++) { + LLAMA_V3_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all)); + } + LLAMA_V3_LOG_INFO("\n"); + } + } +} + + + +// +// interface implementation +// + +struct llama_v3_model * llama_v3_load_model_from_file( + const char * path_model, + struct llama_v3_context_params params) { + ggml_time_init(); + + llama_v3_model * model = new llama_v3_model; + + ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; + + if (!llama_v3_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers, + params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram, + memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, + params.progress_callback_user_data)) { + LLAMA_V3_LOG_ERROR("%s: failed to load model\n", __func__); + delete model; + return nullptr; + } + + return model; +} + +void llama_v3_free_model(struct llama_v3_model * model) { + delete model; +} + +struct llama_v3_context * llama_v3_new_context_with_model( + struct llama_v3_model * model, + struct llama_v3_context_params params) { + + if (!model) { + return nullptr; + } + + llama_v3_context * ctx = new llama_v3_context(*model); + + if (params.seed == LLAMA_V3_DEFAULT_SEED) { + params.seed = time(NULL); + } + + size_t blasbatchmul = get_blas_batch_mul3(params.n_batch); + + unsigned cur_percentage = 0; + if (params.progress_callback == NULL) { + params.progress_callback_user_data = &cur_percentage; + params.progress_callback = [](float progress, void * ctx) { + unsigned * cur_percentage_p = (unsigned *) ctx; + unsigned percentage = (unsigned) (100 * progress); + while (percentage > *cur_percentage_p) { + *cur_percentage_p = percentage; + LLAMA_V3_LOG_INFO("."); + if (percentage >= 100) { + LLAMA_V3_LOG_INFO("\n"); + } + } + }; + } + + ctx->rng = std::mt19937(params.seed); + ctx->logits_all = params.logits_all; + + ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + + // reserve memory for context buffers + if (!params.vocab_only) { + if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { + LLAMA_V3_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__); + llama_v3_free(ctx); + return nullptr; + } + + { + const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); + LLAMA_V3_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + } + + const auto & hparams = ctx->model.hparams; + + // resized during inference + if (params.logits_all) { + ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); + } else { + ctx->logits.reserve(hparams.n_vocab); + } + + if (params.embedding){ + ctx->embedding.resize(hparams.n_embd); + } + +#ifdef LLAMA_V3_USE_ALLOCATOR + { + static const size_t tensor_alignment = 32; + // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data + ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + + // create measure allocator + ctx->alloc = ggml_allocr_new_measure(tensor_alignment); + + // build worst-case graph + int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); + int n_past = hparams.n_ctx - n_tokens; + llama_v3_token token = llama_v3_token_bos(); // not actually used by llama_v3_build_graph, but required to choose between token and embedding inputs graph + ggml_cgraph * gf = llama_v3_build_graph(*ctx, &token, NULL, n_tokens, n_past); +#ifdef GGML_USE_METAL + if (params.n_gpu_layers > 0) { + ctx->ctx_metal = ggml_metal_init(1); + if (!ctx->ctx_metal) { + LLAMA_V3_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); + llama_v3_free(ctx); + return NULL; + } + ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); + ggml_allocr_set_parse_seq(ctx->alloc, 
ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); + } +#endif + // measure memory requirements for the graph + size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; + + LLAMA_V3_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + + // debug - for comparison with scratch buffer + //size_t prev_req = + // MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(ctx->model.type) + + // MEM_REQ_SCRATCH1_3().at(ctx->model.type) + + // MEM_REQ_EVAL_3().at(ctx->model.type); + //LLAMA_V3_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); + + // recreate allocator with exact memory requirements + ggml_allocr_free(ctx->alloc); + + ctx->buf_alloc.resize(alloc_size); + ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment); +#ifdef GGML_USE_METAL + if (ctx->ctx_metal) { + ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); + } +#endif + } +#else + ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL_3().at(ctx->model.type) + ggml_graph_overhead()); +#endif + +#ifdef LLAMA_V3_USE_SCRATCH + ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(ctx->model.type)); + ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1_3().at(ctx->model.type)); +#endif + } + +#ifdef GGML_USE_METAL + if (params.n_gpu_layers > 0) { + // this allocates all Metal resources and memory buffers + + void * data_ptr = NULL; + size_t data_size = 0; + + if (params.use_mmap) { + data_ptr = ctx->model.mapping->addr; + data_size = ctx->model.mapping->size; + } else { + data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + data_size = ggml_get_mem_size (ctx->model.ctx); + } + + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); + + LLAMA_V3_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + +#define LLAMA_V3_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + LLAMA_V3_LOG_ERROR("%s: failed to add buffer\n", __func__); \ + llama_v3_free(ctx); \ + return NULL; \ + } + + LLAMA_V3_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); + + LLAMA_V3_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); + LLAMA_V3_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0)); + + LLAMA_V3_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0)); +#undef LLAMA_V3_METAL_CHECK_BUF + } +#endif + +#ifdef GGML_USE_MPI + ctx->ctx_mpi = ggml_mpi_init(); + + if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { + // Enter a blocking eval loop with dummy input, letting rank=0 drive the process + const std::vector tmp(ctx->model.hparams.n_ctx, llama_v3_token_bos()); + while (!llama_v3_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; + llama_v3_backend_free(); + exit(1); + } +#endif + + return ctx; +} + +struct llama_v3_context * llama_v3_init_from_file( + const char * path_model, + struct llama_v3_context_params params) { + + struct llama_v3_model * model = llama_v3_load_model_from_file(path_model, params); + if (!model) { + return nullptr; + } + struct llama_v3_context * ctx = llama_v3_new_context_with_model(model, params); + ctx->model_owner = true; + return ctx; +} + +void llama_v3_free(struct llama_v3_context * ctx) { + delete ctx; +} + +int 
llama_v3_model_quantize(
+        const char * fname_inp,
+        const char * fname_out,
+        const llama_v3_model_quantize_params * params) {
+    try {
+        llama_v3_model_quantize_internal(fname_inp, fname_out, params);
+        return 0;
+    } catch (const std::exception & err) {
+        LLAMA_V3_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
+        return 1;
+    }
+}
+
+int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
+    LLAMA_V3_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+    const int64_t t_start_lora_us = ggml_time_us();
+
+    auto fin = std::ifstream(path_lora, std::ios::binary);
+    if (!fin) {
+        LLAMA_V3_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
+        return 1;
+    }
+
+    // verify magic and version
+    {
+        uint32_t magic;
+        fin.read((char *) &magic, sizeof(magic));
+        if (magic != LLAMA_V3_FILE_MAGIC_GGLA) {
+            LLAMA_V3_LOG_ERROR("%s: bad file magic\n", __func__);
+            return 1;
+        }
+        uint32_t format_version;
+        fin.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != 1) {
+            LLAMA_V3_LOG_ERROR("%s: unsupported file version\n", __func__ );
+            return 1;
+        }
+    }
+
+    int32_t lora_r;
+    int32_t lora_alpha;
+    fin.read((char *) &lora_r, sizeof(lora_r));
+    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
+    float scaling = (float)lora_alpha / (float)lora_r;
+
+    LLAMA_V3_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+
+    // create a temporary ggml context to store the lora tensors
+    // todo: calculate size from biggest possible tensor
+    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+    struct ggml_init_params params;
+    params.mem_size   = lora_buf.size();
+    params.mem_buffer = lora_buf.data();
+    params.no_alloc   = false;
+
+    ggml_context * lora_ctx = ggml_init(params);
+    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
+
+    // create a name -> tensor map of the model to accelerate lookups
+    std::unordered_map<std::string, struct ggml_tensor *> model_tensors;
+    for (const auto & kv: model.tensors_by_name) {
+        model_tensors.insert(kv);
+    }
+
+
+    // load base model
+    std::unique_ptr<llama_v3_model_loader> model_loader;
+    ggml_context * base_ctx = NULL;
+    llama_v3_buffer base_buf;
+    if (path_base_model) {
+        LLAMA_V3_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
+        model_loader.reset(new llama_v3_model_loader(path_base_model, /*use_mmap*/ true));
+
+        size_t ctx_size;
+        size_t mmapped_size;
+        model_loader->calc_sizes(&ctx_size, &mmapped_size);
+        base_buf.resize(ctx_size);
+
+        ggml_init_params base_params;
+        base_params.mem_size   = base_buf.size;
+        base_params.mem_buffer = base_buf.addr;
+        base_params.no_alloc   = model_loader->use_mmap;
+
+        base_ctx = ggml_init(base_params);
+
+        model_loader->ggml_ctx = base_ctx;
+
+        // maybe this should be in llama_v3_model_loader
+        if (model_loader->use_mmap) {
+            model_loader->mapping.reset(new llama_v3_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
+        }
+    }
+
+    // read tensors and apply
+    bool warned = false;
+    int n_tensors = 0;
+
+    std::vector<uint8_t> work_buffer;
+
+    while (true) {
+        int32_t n_dims;
+        int32_t length;
+        int32_t ftype;
+
+        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+        fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+        if (fin.eof()) {
+            break;
+        }
+
+        int32_t ne[2] = { 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+        }
+
+        std::string name;
+        {
+            char buf[1024];
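+            // the name is stored as raw, length-prefixed bytes with no NUL
+            // terminator, so read exactly 'length' bytes into a local buffer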
+ fin.read(buf, length); + name = std::string(buf, length); + } + + // check for lora suffix and get the type of tensor + const std::string lora_suffix = ".lora"; + size_t pos = name.rfind(lora_suffix); + if (pos == std::string::npos) { + LLAMA_V3_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); + return 1; + } + + std::string lora_type = name.substr(pos + lora_suffix.length()); + std::string base_name = name; + base_name.erase(pos); + // LLAMA_V3_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); + + if (model_tensors.find(base_name) == model_tensors.end()) { + LLAMA_V3_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); + return 1; + } + + // create ggml tensor + ggml_type wtype; + switch (ftype) { + case 0: wtype = GGML_TYPE_F32; break; + case 1: wtype = GGML_TYPE_F16; break; + default: + { + LLAMA_V3_LOG_ERROR("%s: invalid tensor data type '%d'\n", + __func__, ftype); + return false; + } + } + ggml_tensor * lora_tensor; + if (n_dims == 2) { + lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); + } + else { + LLAMA_V3_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); + return 1; + } + ggml_set_name(lora_tensor, "lora_tensor"); + + // load tensor data + size_t offset = fin.tellg(); + size_t tensor_data_size = ggml_nbytes(lora_tensor); + offset = (offset + 31) & -32; + fin.seekg(offset); + fin.read((char*)lora_tensor->data, tensor_data_size); + + lora_tensors[name] = lora_tensor; + + // check if we have both A and B tensors and apply + if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && + lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { + + ggml_tensor * dest_t = model_tensors[base_name]; + + offload_func_t offload_func = llama_v3_nop; + offload_func_t offload_func_force_inplace = llama_v3_nop; + +#ifdef GGML_USE_CUBLAS + if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { + if (dest_t->type != GGML_TYPE_F16) { + throw std::runtime_error(format_old( + "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__)); + } + offload_func = ggml_cuda_assign_buffers; + offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; + } +#endif // GGML_USE_CUBLAS + + ggml_tensor * base_t; + if (model_loader) { + // load from base model + if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) { + LLAMA_V3_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); + return 1; + } + size_t idx = model_loader->tensors_map.name_to_idx[base_name]; + llama_v3_load_tensor & lt = model_loader->tensors_map.tensors[idx]; + base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); + lt.data = (uint8_t *) lt.ggml_tensor->data; + model_loader->load_data_for(lt); + lt.ggml_tensor->data = lt.data; + } + else { + base_t = dest_t; + } + + if (ggml_is_quantized(base_t->type)) { + if (!warned) { + LLAMA_V3_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + } + + ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; + GGML_ASSERT(loraA->type == GGML_TYPE_F32); + ggml_set_name(loraA, "loraA"); + + ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; + GGML_ASSERT(loraB->type == 
GGML_TYPE_F32); + ggml_set_name(loraB, "loraB"); + + if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { + LLAMA_V3_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" + " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + return 1; + } + + // w = w + BA*s + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + offload_func(BA); + ggml_set_name(BA, "BA"); + + if (scaling != 1.0f) { + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + ggml_set_name(scale_tensor, "scale_tensor"); + + BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + offload_func(BA); + ggml_set_name(BA, "BA_scaled"); + } + + ggml_tensor * r; + if (base_t == dest_t) { + r = ggml_add_inplace(lora_ctx, dest_t, BA); + offload_func_force_inplace(r); + ggml_set_name(r, "r_add_inplace"); + } + else { + r = ggml_add(lora_ctx, base_t, BA); + offload_func(r); + ggml_set_name(r, "r_add"); + + r = ggml_cpy(lora_ctx, r, dest_t); + offload_func(r); + ggml_set_name(r, "r_cpy"); + } + + struct ggml_cgraph gf = ggml_build_forward(r); + + llv3_graph_compute_helper(work_buffer, &gf, n_threads); + + // we won't need these tensors again, reset the context to save memory + ggml_free(lora_ctx); + lora_ctx = ggml_init(params); + lora_tensors.clear(); + + n_tensors++; + if (n_tensors % 4 == 0) { + LLAMA_V3_LOG_INFO("."); + } + } + } + + // TODO: this should be in a destructor, it will leak on failure + ggml_free(lora_ctx); + if (base_ctx) { + ggml_free(base_ctx); + } + + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; + LLAMA_V3_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); + + return 0; +} + +int llama_v3_apply_lora_from_file(struct llama_v3_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { + try { + return llama_v3_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads); + } catch (const std::exception & err) { + LLAMA_V3_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_v3_model_apply_lora_from_file(const struct llama_v3_model * model, const char * path_lora, const char * path_base_model, int n_threads) { + try { + return llama_v3_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads); + } catch (const std::exception & err) { + LLAMA_V3_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_v3_get_kv_cache_token_count(const struct llama_v3_context * ctx) { + return ctx->kv_self.n; +} + +#define LLAMA_V3_MAX_RNG_STATE (64*1024) + +void llama_v3_set_rng_seed(struct llama_v3_context * ctx, uint32_t seed) { + if (seed == LLAMA_V3_DEFAULT_SEED) { + seed = time(NULL); + } + ctx->rng.seed(seed); +} + +// Returns the *maximum* size of the state +size_t llama_v3_get_state_size(const struct llama_v3_context * ctx) { + // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. + // for reference, std::mt19937(1337) serializes to 6701 bytes. 
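+    // the bound below mirrors the layout written by llama_v3_copy_state_data_internal();
+    // as a sketch (these field names are descriptive only, not part of the API):
+    //
+    //   size_t rng_size;   char    rng_buf[LLAMA_V3_MAX_RNG_STATE];
+    //   size_t logits_cap; size_t  logits_size; float logits[logits_cap];
+    //   size_t emb_size;   float   embedding[emb_size];
+    //   size_t kv_size;    int     kv_ntok;     uint8_t kv_buf[kv_size];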
+    const size_t s_rng_size        = sizeof(size_t);
+    const size_t s_rng             = LLAMA_V3_MAX_RNG_STATE;
+    const size_t s_logits_capacity = sizeof(size_t);
+    const size_t s_logits_size     = sizeof(size_t);
+    const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
+    const size_t s_embedding_size  = sizeof(size_t);
+    const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
+    const size_t s_kv_size         = sizeof(size_t);
+    const size_t s_kv_ntok         = sizeof(int);
+    const size_t s_kv              = ctx->kv_self.buf.size;
+
+    const size_t s_total = (
+        + s_rng_size
+        + s_rng
+        + s_logits_capacity
+        + s_logits_size
+        + s_logits
+        + s_embedding_size
+        + s_embedding
+        + s_kv_size
+        + s_kv_ntok
+        + s_kv
+    );
+
+    return s_total;
+}
+
+/** copy state data into either a buffer or file depending on the passed in context
+ *
+ * file context:
+ * llama_v3_file file("/path", "wb");
+ * llama_v3_data_file_context data_ctx(&file);
+ * llama_v3_copy_state_data(ctx, &data_ctx);
+ *
+ * buffer context:
+ * std::vector<uint8_t> buf(max_size, 0);
+ * llama_v3_data_buffer_context data_ctx(buf.data());
+ * llama_v3_copy_state_data(ctx, &data_ctx);
+ *
+*/
+void llama_v3_copy_state_data_internal(struct llama_v3_context * ctx, llama_v3_data_context * data_ctx) {
+    // copy rng
+    {
+        std::stringstream rng_ss;
+        rng_ss << ctx->rng;
+
+        const size_t rng_size = rng_ss.str().size();
+        char rng_buf[LLAMA_V3_MAX_RNG_STATE];
+
+        memset(&rng_buf[0], 0, LLAMA_V3_MAX_RNG_STATE);
+        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+
+        data_ctx->write(&rng_size,   sizeof(rng_size));
+        data_ctx->write(&rng_buf[0], LLAMA_V3_MAX_RNG_STATE);
+    }
+
+    // copy logits
+    {
+        const size_t logits_cap  = ctx->logits.capacity();
+        const size_t logits_size = ctx->logits.size();
+
+        data_ctx->write(&logits_cap,  sizeof(logits_cap));
+        data_ctx->write(&logits_size, sizeof(logits_size));
+
+        if (logits_size) {
+            data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
+        }
+
+        // If there is a gap between the size and the capacity, write padding
+        size_t padding_size = (logits_cap - logits_size) * sizeof(float);
+        if (padding_size > 0) {
+            std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
+            data_ctx->write(padding.data(), padding_size);
+        }
+    }
+
+    // copy embeddings
+    {
+        const size_t embedding_size = ctx->embedding.size();
+
+        data_ctx->write(&embedding_size, sizeof(embedding_size));
+
+        if (embedding_size) {
+            data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
+        }
+    }
+
+    // copy kv cache
+    {
+        const auto & kv_self = ctx->kv_self;
+        const auto & hparams = ctx->model.hparams;
+        const int    n_layer = hparams.n_layer;
+        const int    n_embd  = hparams.n_embd_gqa();
+        const int    n_ctx   = hparams.n_ctx;
+
+        const size_t kv_size = kv_self.buf.size;
+        const int    kv_ntok = llama_v3_get_kv_cache_token_count(ctx);
+
+        data_ctx->write(&kv_size, sizeof(kv_size));
+        data_ctx->write(&kv_ntok, sizeof(kv_ntok));
+
+        if (kv_size) {
+            const size_t elt_size = ggml_element_size(kv_self.k);
+
+            ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
+            ggml_cgraph gf{};
+
+            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
+            kout3d->data = kout3d_data.data();
+
+            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
+            vout3d->data = vout3d_data.data();
+
+            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
+                n_embd, kv_ntok, n_layer,
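+                // (strides: one row is n_embd elements, consecutive layers sit
+                // n_ctx rows apart in the cache; the view selects only the
+                // first kv_ntok rows of each layer, i.e. the live tokens)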
elt_size*n_embd, elt_size*n_embd*n_ctx, 0); + + ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, + kv_ntok, n_embd, n_layer, + elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); + + ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); + ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); + llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + + ggml_free(cpy_ctx); + + // our data is now in the kout3d_data and vout3d_data buffers + // write them to file + data_ctx->write(kout3d_data.data(), kout3d_data.size()); + data_ctx->write(vout3d_data.data(), vout3d_data.size()); + } + } +} + +size_t llama_v3_copy_state_data(struct llama_v3_context * ctx, uint8_t * dst) { + llama_v3_data_buffer_context data_ctx(dst); + llama_v3_copy_state_data_internal(ctx, &data_ctx); + + return data_ctx.get_size_written(); +} + +// Sets the state reading from the specified source address +size_t llama_v3_set_state_data(struct llama_v3_context * ctx, uint8_t * src) { + uint8_t * inp = src; + + // set rng + { + size_t rng_size; + char rng_buf[LLAMA_V3_MAX_RNG_STATE]; + + memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size); + memcpy(&rng_buf[0], inp, LLAMA_V3_MAX_RNG_STATE); inp += LLAMA_V3_MAX_RNG_STATE; + + std::stringstream rng_ss; + rng_ss.str(std::string(&rng_buf[0], rng_size)); + rng_ss >> ctx->rng; + + LLAMA_V3_ASSERT(rng_ss.fail() == false); + } + + // set logits + { + size_t logits_cap; + size_t logits_size; + + memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap); + memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); + + LLAMA_V3_ASSERT(ctx->logits.capacity() == logits_cap); + + if (logits_size) { + ctx->logits.resize(logits_size); + memcpy(ctx->logits.data(), inp, logits_size * sizeof(float)); + } + + inp += logits_cap * sizeof(float); + } + + // set embeddings + { + size_t embedding_size; + + memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size); + + LLAMA_V3_ASSERT(ctx->embedding.capacity() == embedding_size); + + if (embedding_size) { + memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float)); + inp += embedding_size * sizeof(float); + } + } + + // set kv cache + { + const auto & kv_self = ctx->kv_self; + const auto & hparams = ctx->model.hparams; + const int n_layer = hparams.n_layer; + const int n_embd = hparams.n_embd_gqa(); + const int n_ctx = hparams.n_ctx; + + size_t kv_size; + int kv_ntok; + + memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); + memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok); + + if (kv_size) { + LLAMA_V3_ASSERT(kv_self.buf.size == kv_size); + + const size_t elt_size = ggml_element_size(kv_self.k); + + ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); + ggml_cgraph gf{}; + + ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); + kin3d->data = (void *) inp; + inp += ggml_nbytes(kin3d); + + ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); + vin3d->data = (void *) inp; + inp += ggml_nbytes(vin3d); + + ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, + n_embd, kv_ntok, n_layer, + elt_size*n_embd, elt_size*n_embd*n_ctx, 0); + + ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, + kv_ntok, n_embd, n_layer, + elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); + + ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); + ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); + 
llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + + ggml_free(cpy_ctx); + } + + ctx->kv_self.n = kv_ntok; + } + + const size_t nread = inp - src; + const size_t max_size = llama_v3_get_state_size(ctx); + + LLAMA_V3_ASSERT(nread <= max_size); + + return nread; +} + +static bool llama_v3_load_session_file_internal(struct llama_v3_context * ctx, const char * path_session, llama_v3_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_v3_file file(path_session, "rb"); + + // sanity checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); + + if (magic != LLAMA_V3_SESSION_MAGIC || version != LLAMA_V3_SESSION_VERSION) { + LLAMA_V3_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); + return false; + } + + llama_v3_hparams session_hparams; + file.read_raw(&session_hparams, sizeof(llama_v3_hparams)); + + if (session_hparams != ctx->model.hparams) { + LLAMA_V3_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__); + return false; + } + } + + // load the prompt + { + const uint32_t n_token_count = file.read_u32(); + + if (n_token_count > n_token_capacity) { + LLAMA_V3_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + return false; + } + + file.read_raw(tokens_out, sizeof(llama_v3_token) * n_token_count); + *n_token_count_out = n_token_count; + } + + // restore the context state + { + const size_t n_state_size_cur = file.size - file.tell(); + const size_t n_state_size_max = llama_v3_get_state_size(ctx); + + if (n_state_size_cur > n_state_size_max) { + LLAMA_V3_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); + return false; + } + + std::vector state_data(n_state_size_max); + file.read_raw(state_data.data(), n_state_size_cur); + + llama_v3_set_state_data(ctx, state_data.data()); + } + + return true; +} + +bool llama_v3_load_session_file(struct llama_v3_context * ctx, const char * path_session, llama_v3_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + try { + return llama_v3_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); + } catch (const std::exception & err) { + LLAMA_V3_LOG_ERROR("error loading session file: %s\n", err.what()); + return false; + } +} + +bool llama_v3_save_session_file(struct llama_v3_context * ctx, const char * path_session, const llama_v3_token * tokens, size_t n_token_count) { + llama_v3_file file(path_session, "wb"); + + file.write_u32(LLAMA_V3_SESSION_MAGIC); + file.write_u32(LLAMA_V3_SESSION_VERSION); + + file.write_raw(&ctx->model.hparams, sizeof(llama_v3_hparams)); + + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_v3_token) * n_token_count); + + // save the context state using stream saving + llama_v3_data_file_context data_ctx(&file); + llama_v3_copy_state_data_internal(ctx, &data_ctx); + + return true; +} + +int llama_v3_eval( + struct llama_v3_context * ctx, + const llama_v3_token * tokens, + int n_tokens, + int n_past, + int n_threads) { + if (!llama_v3_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) { + LLAMA_V3_LOG_ERROR("%s: failed to eval\n", __func__); + return 1; + } + + // get a more accurate load time, upon first eval + // TODO: fix this + if (!ctx->has_evaluated_once) { + ctx->t_load_us = ggml_time_us() - 
ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    return 0;
+}
+
+
+int llama_v3_eval_embd(
+        struct llama_v3_context * ctx,
+        const float * embd,
+        int n_tokens,
+        int n_past,
+        int n_threads) {
+    if (!llama_v3_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
+        LLAMA_V3_LOG_ERROR("%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    return 0;
+}
+
+int llama_v3_eval_export(struct llama_v3_context * ctx, const char * fname) {
+    const int n_batch = 1;
+    const int n_ctx   = 512 - n_batch;
+
+    const std::vector<llama_v3_token> tmp(n_batch, llama_v3_token_bos());
+
+    if (!llama_v3_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
+        LLAMA_V3_LOG_ERROR("%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    return 0;
+}
+
+int llama_v3_tokenize_with_model(
+        const struct llama_v3_model * model,
+        const char * text,
+        llama_v3_token * tokens,
+        int n_max_tokens,
+        bool add_bos) {
+    auto res = llama_v3_tokenize(model->vocab, text, add_bos);
+
+    if (n_max_tokens < (int) res.size()) {
+        LLAMA_V3_LOG_ERROR("%s: too many tokens\n", __func__);
+        return -((int) res.size());
+    }
+
+    for (size_t i = 0; i < res.size(); i++) {
+        tokens[i] = res[i];
+    }
+
+    return res.size();
+}
+
+int llama_v3_tokenize(
+        struct llama_v3_context * ctx,
+        const char * text,
+        llama_v3_token * tokens,
+        int n_max_tokens,
+        bool add_bos) {
+    return llama_v3_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+}
+
+int llama_v3_n_vocab_from_model(const struct llama_v3_model * model) {
+    return model->vocab.id_to_token.size();
+}
+
+int llama_v3_n_ctx_from_model(const struct llama_v3_model * model) {
+    return model->hparams.n_ctx;
+}
+
+int llama_v3_n_embd_from_model(const struct llama_v3_model * model) {
+    return model->hparams.n_embd;
+}
+
+int llama_v3_n_vocab(const struct llama_v3_context * ctx) {
+    return ctx->model.vocab.id_to_token.size();
+}
+
+int llama_v3_n_ctx(const struct llama_v3_context * ctx) {
+    return ctx->model.hparams.n_ctx;
+}
+
+int llama_v3_n_embd(const struct llama_v3_context * ctx) {
+    return ctx->model.hparams.n_embd;
+}
+
+int llama_v3_model_type(const struct llama_v3_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_v3_model_type_name(model->type), llama_v3_ftype_name(model->hparams.ftype));
+}
+
+int llama_v3_get_vocab_from_model(
+        const struct llama_v3_model * model,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    int n = std::min(capacity, (int) model->vocab.id_to_token.size());
+    for (int i = 0; i < n; i++) {
+        strings[i] = model->vocab.id_to_token[i].tok.c_str();
+        scores[i]  = model->vocab.id_to_token[i].score;
+    }
+    return n;
+}
+
+int llama_v3_get_vocab(
+        const struct llama_v3_context * ctx,
+        const char * * strings,
+        float * scores,
+        int capacity) {
+    return llama_v3_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+}
+
+float * llama_v3_get_logits(struct llama_v3_context * ctx) {
+    return ctx->logits.data();
+}
+
+float * llama_v3_get_embeddings(struct llama_v3_context * ctx) {
+    return ctx->embedding.data();
+}
+
+const char * llama_v3_token_to_str_with_model(const struct llama_v3_model * model, llama_v3_token token) {
+    if (token >= llama_v3_n_vocab_from_model(model)) {
+        return nullptr;
+    }
+
+    return model->vocab.id_to_token[token].tok.c_str();
+}
+
+const char *
llama_v3_token_to_str(const struct llama_v3_context * ctx, llama_v3_token token) { + return llama_v3_token_to_str_with_model(&ctx->model, token); +} + +llama_v3_token llama_v3_token_bos() { + return 1; +} + +llama_v3_token llama_v3_token_eos() { + return 2; +} + +llama_v3_token llama_v3_token_nl() { + return 13; +} + +struct llama_v3_timings llama_v3_get_timings(struct llama_v3_context * ctx) { + struct llama_v3_timings result = { + /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, + /*.t_end_ms =*/ 1.00 * ggml_time_ms(), + /*.t_load_ms =*/ 1e-3 * ctx->t_load_us, + /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us, + /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us, + /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us, + + /*.n_sample =*/ std::max(1, ctx->n_sample), + /*.n_p_eval =*/ std::max(1, ctx->n_p_eval), + /*.n_eval =*/ std::max(1, ctx->n_eval), + }; + + return result; +} + +void llama_v3_print_timings(struct llama_v3_context * ctx) { + const llama_v3_timings timings = llama_v3_get_timings(ctx); + + LLAMA_V3_LOG_INFO("\n"); + LLAMA_V3_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms); + LLAMA_V3_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample); + LLAMA_V3_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); + LLAMA_V3_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); + LLAMA_V3_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); +} + +void llama_v3_reset_timings(struct llama_v3_context * ctx) { + ctx->t_start_us = ggml_time_us(); + ctx->t_sample_us = ctx->n_sample = 0; + ctx->t_eval_us = ctx->n_eval = 0; + ctx->t_p_eval_us = ctx->n_p_eval = 0; +} + +const char * llama_v3_print_system_info(void) { + static std::string s; + + s = ""; + s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; + s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; + s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; + s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; + s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; + s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; + s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; + s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; + s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + + return s.c_str(); +} + +// For internal test use +const std::vector>& llama_v3_internal_get_tensor_map(struct llama_v3_context * ctx) { + return ctx->model.tensors_by_name; +} + + +void llama_v3_log_set(llama_v3_log_callback log_callback, void * user_data) { + llv3_g_state.log_callback = log_callback ? 
log_callback : llama_v3_log_callback_default;
+    llv3_g_state.log_callback_user_data = user_data;
+}
+
+#if defined(_MSC_VER) && !defined(vsnprintf)
+#define vsnprintf _vsnprintf
+#endif
+
+static void llama_v3_log_internal_v(llama_v3_log_level level, const char * format, va_list args) {
+    va_list args_copy;
+    va_copy(args_copy, args);
+    char buffer[128];
+    int len = vsnprintf(buffer, 128, format, args);
+    if (len < 128) {
+        llv3_g_state.log_callback(level, buffer, llv3_g_state.log_callback_user_data);
+    } else {
+        char* buffer2 = new char[len+1];
+        vsnprintf(buffer2, len+1, format, args_copy);
+        buffer2[len] = 0;
+        llv3_g_state.log_callback(level, buffer2, llv3_g_state.log_callback_user_data);
+        delete[] buffer2;
+    }
+    va_end(args_copy);
+}
+
+static void llama_v3_log_internal(llama_v3_log_level level, const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    llama_v3_log_internal_v(level, format, args);
+    va_end(args);
+}
+
+static void llama_v3_log_callback_default(llama_v3_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
diff --git a/otherarch/llama_v3.h b/otherarch/llama_v3.h
new file mode 100644
index 0000000000000000000000000000000000000000..2cc4b47074158d919436d6c5a67abff227c92293
--- /dev/null
+++ b/otherarch/llama_v3.h
@@ -0,0 +1,486 @@
+#ifndef LLAMA_V3_H
+#define LLAMA_V3_H
+
+#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_V3_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_V3_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef LLAMA_V3_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef LLAMA_V3_BUILD
+#            define LLAMA_V3_API __declspec(dllexport)
+#        else
+#            define LLAMA_V3_API __declspec(dllimport)
+#        endif
+#    else
+#        define LLAMA_V3_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define LLAMA_V3_API
+#endif
+
+#ifdef __GNUC__
+#    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define DEPRECATED(func, hint) func
+#endif
+
+#define LLAMA_V3_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_V3_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
+#define LLAMA_V3_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
+#define LLAMA_V3_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
+#define LLAMA_V3_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+
+#define LLAMA_V3_FILE_VERSION           3
+#define LLAMA_V3_FILE_MAGIC             LLAMA_V3_FILE_MAGIC_GGJT
+#define LLAMA_V3_FILE_MAGIC_UNVERSIONED LLAMA_V3_FILE_MAGIC_GGML
+#define LLAMA_V3_SESSION_MAGIC          LLAMA_V3_FILE_MAGIC_GGSN
+#define LLAMA_V3_SESSION_VERSION        1
+
+#define LLAMA_V3_DEFAULT_SEED           0xFFFFFFFF
+
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_V3_SUPPORTS_GPU_OFFLOAD +#endif + +#ifndef LLAMA_V3_DEFAULT_RMS_EPS +#define LLAMA_V3_DEFAULT_RMS_EPS 5e-6f +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + // + // C interface + // + // TODO: show sample usage + // + + struct llama_v3_model; + struct llama_v3_context; + + typedef int llama_v3_token; + + typedef struct llama_v3_token_data { + llama_v3_token id; // token id + float logit; // log-odds of the token + float p; // probability of the token + } llama_v3_token_data; + + typedef struct llama_v3_token_data_array { + llama_v3_token_data * data; + size_t size; + bool sorted; + } llama_v3_token_data_array; + + typedef void (*llama_v3_progress_callback)(float progress, void *ctx); + + enum llama_v3_log_level { + LLAMA_V3_LOG_LEVEL_ERROR = 2, + LLAMA_V3_LOG_LEVEL_WARN = 3, + LLAMA_V3_LOG_LEVEL_INFO = 4 + }; + + // Signature for logging events + // Note that text includes the new line character at the end for most events. + // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it + // if it exists. + // It might not exist for progress report where '.' is output repeatedly. + typedef void (*llama_v3_log_callback)(enum llama_v3_log_level level, const char * text, void * user_data); + + struct llama_v3_context_params { + uint32_t seed; // RNG seed, -1 for random + int32_t n_ctx; // text context + int32_t n_batch; // prompt processing batch size + int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams) + float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams) + int32_t n_gpu_layers; // number of layers to store in VRAM + int32_t main_gpu; // the GPU that is used for scratch and small tensors + + const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_V3_MAX_DEVICES) + + // ref: https://github.com/ggerganov/llama.cpp/pull/2054 + float rope_freq_base; // RoPE base frequency + float rope_freq_scale; // RoPE frequency scaling factor + + // called with a progress value between 0 and 1, pass NULL to disable + llama_v3_progress_callback progress_callback; + // context pointer passed to the progress callback + void * progress_callback_user_data; + + // Keep the booleans together to avoid misalignment during copy-by-value. 
+ bool low_vram; // if true, reduce VRAM usage at the cost of performance + bool mul_mat_q; // if true, use experimental mul_mat_q kernels + bool f16_kv; // use fp16 for KV cache + bool logits_all; // the llama_v3_eval() call computes all logits, not just the last one + bool vocab_only; // only load the vocabulary, no weights + bool use_mmap; // use mmap if possible + bool use_mlock; // force system to keep model in RAM + bool embedding; // embedding mode only + }; + // model file types + enum llama_v3_ftype { + LLAMA_V3_FTYPE_ALL_F32 = 0, + LLAMA_V3_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + // LLAMA_V3_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed + // LLAMA_V3_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed + LLAMA_V3_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors + LLAMA_V3_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors + }; + + // model quantization parameters + typedef struct llama_v3_model_quantize_params { + int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() + enum llama_v3_ftype ftype; // quantize to this llama_v3_ftype + bool allow_requantize; // allow quantizing non-f32/f16 tensors + bool quantize_output_tensor; // quantize output.weight + } llama_v3_model_quantize_params; + + // grammar types + struct llama_v3_grammar; + + // grammar element type + enum llama_v3_gretype { + // end of rule definition + LLAMA_V3_GRETYPE_END = 0, + + // start of alternate definition for rule + LLAMA_V3_GRETYPE_ALT = 1, + + // non-terminal element: reference to rule + LLAMA_V3_GRETYPE_RULE_REF = 2, + + // terminal element: character (code point) + LLAMA_V3_GRETYPE_CHAR = 3, + + // inverse char(s) ([^a], [^a-b] [^abc]) + LLAMA_V3_GRETYPE_CHAR_NOT = 4, + + // modifies a preceding LLAMA_V3_GRETYPE_CHAR or LLAMA_V3_GRETYPE_CHAR_ALT to + // be an inclusive range ([a-z]) + LLAMA_V3_GRETYPE_CHAR_RNG_UPPER = 5, + + // modifies a preceding LLAMA_V3_GRETYPE_CHAR or + // LLAMA_V3_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) + LLAMA_V3_GRETYPE_CHAR_ALT = 6, + }; + + typedef struct llama_v3_grammar_element { + enum llama_v3_gretype type; + uint32_t value; // Unicode code point or rule ID + } llama_v3_grammar_element; + + // performance timing information + struct llama_v3_timings { + double t_start_ms; + double t_end_ms; + double t_load_ms; + double t_sample_ms; + double t_p_eval_ms; + double t_eval_ms; + + int32_t n_sample; + int32_t n_p_eval; + int32_t n_eval; + }; + + // Set callback for all future logging events. + // If this is not called, or NULL is supplied, everything is output on stderr. 
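+    // For example (an illustrative sketch; my_logger is not part of this API):
+    //
+    //   static void my_logger(enum llama_v3_log_level level, const char * text, void * user_data) {
+    //       fprintf((FILE *) user_data, "[%d] %s", (int) level, text);
+    //   }
+    //
+    //   llama_v3_log_set(my_logger, stderr);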
+ LLAMA_V3_API void llama_v3_log_set(llama_v3_log_callback log_callback, void * user_data); + + LLAMA_V3_API int llama_v3_max_devices(); + + LLAMA_V3_API struct llama_v3_context_params llama_v3_context_default_params(); + LLAMA_V3_API struct llama_v3_model_quantize_params llama_v3_model_quantize_default_params(); + + LLAMA_V3_API bool llama_v3_mmap_supported(); + LLAMA_V3_API bool llama_v3_mlock_supported(); + + // TODO: not great API - very likely to change + // Initialize the llama + ggml backend + // If numa is true, use NUMA optimizations + // Call once at the start of the program + LLAMA_V3_API void llama_v3_backend_init(bool numa); + // Call once at the end of the program - currently only used for MPI + LLAMA_V3_API void llama_v3_backend_free(); + + LLAMA_V3_API int64_t llama_v3_time_us(); + + LLAMA_V3_API struct llama_v3_model * llama_v3_load_model_from_file( + const char * path_model, + struct llama_v3_context_params params); + + LLAMA_V3_API void llama_v3_free_model(struct llama_v3_model * model); + + LLAMA_V3_API struct llama_v3_context * llama_v3_new_context_with_model( + struct llama_v3_model * model, + struct llama_v3_context_params params); + + // Various functions for loading a ggml llama model. + // Allocate (almost) all memory needed for the model. + // Return NULL on failure + LLAMA_V3_API struct llama_v3_context * llama_v3_init_from_file( + const char * path_model, + struct llama_v3_context_params params); + + // Frees all allocated memory + LLAMA_V3_API void llama_v3_free(struct llama_v3_context * ctx); + + // Returns 0 on success + LLAMA_V3_API int llama_v3_model_quantize( + const char * fname_inp, + const char * fname_out, + const llama_v3_model_quantize_params * params); + + // Apply a LoRA adapter to a loaded model + // path_base_model is the path to a higher quality model to use as a base for + // the layers modified by the adapter. Can be NULL to use the current loaded model. + // The model needs to be reloaded before applying a new adapter, otherwise the adapter + // will be applied on top of the previous one + // Returns 0 on success + LLAMA_V3_API int llama_v3_apply_lora_from_file( + struct llama_v3_context * ctx, + const char * path_lora, + const char * path_base_model, + int n_threads); + + LLAMA_V3_API int llama_v3_model_apply_lora_from_file( + const struct llama_v3_model * model, + const char * path_lora, + const char * path_base_model, + int n_threads); + + // Returns the number of tokens in the KV cache + LLAMA_V3_API int llama_v3_get_kv_cache_token_count(const struct llama_v3_context * ctx); + + // Sets the current rng seed. + LLAMA_V3_API void llama_v3_set_rng_seed(struct llama_v3_context * ctx, uint32_t seed); + + // Returns the maximum size in bytes of the state (rng, logits, embedding + // and kv_cache) - will often be smaller after compacting tokens + LLAMA_V3_API size_t llama_v3_get_state_size(const struct llama_v3_context * ctx); + + // Copies the state to the specified destination address. + // Destination needs to have allocated enough memory. 
+ // Returns the number of bytes copied + LLAMA_V3_API size_t llama_v3_copy_state_data(struct llama_v3_context * ctx, uint8_t * dst); + + // Set the state reading from the specified address + // Returns the number of bytes read + LLAMA_V3_API size_t llama_v3_set_state_data(struct llama_v3_context * ctx, uint8_t * src); + + // Save/load session file + LLAMA_V3_API bool llama_v3_load_session_file(struct llama_v3_context * ctx, const char * path_session, llama_v3_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); + LLAMA_V3_API bool llama_v3_save_session_file(struct llama_v3_context * ctx, const char * path_session, const llama_v3_token * tokens, size_t n_token_count); + + // Run the llama inference to obtain the logits and probabilities for the next token. + // tokens + n_tokens is the provided batch of new tokens to process + // n_past is the number of tokens to use from previous eval calls + // Returns 0 on success + LLAMA_V3_API int llama_v3_eval( + struct llama_v3_context * ctx, + const llama_v3_token * tokens, + int n_tokens, + int n_past, + int n_threads); + + // Same as llama_v3_eval, but use float matrix input directly. + LLAMA_V3_API int llama_v3_eval_embd( + struct llama_v3_context * ctx, + const float * embd, + int n_tokens, + int n_past, + int n_threads); + + // Export a static computation graph for context of 511 and batch size of 1 + // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these + // parameters here to keep things simple + // IMPORTANT: do not use for anything else other than debugging and testing! + LLAMA_V3_API int llama_v3_eval_export(struct llama_v3_context * ctx, const char * fname); + + // Convert the provided text into tokens. + // The tokens pointer must be large enough to hold the resulting tokens. + // Returns the number of tokens on success, no more than n_max_tokens + // Returns a negative number on failure - the number of tokens that would have been returned + // TODO: not sure if correct + LLAMA_V3_API int llama_v3_tokenize( + struct llama_v3_context * ctx, + const char * text, + llama_v3_token * tokens, + int n_max_tokens, + bool add_bos); + + LLAMA_V3_API int llama_v3_tokenize_with_model( + const struct llama_v3_model * model, + const char * text, + llama_v3_token * tokens, + int n_max_tokens, + bool add_bos); + + LLAMA_V3_API int llama_v3_n_vocab(const struct llama_v3_context * ctx); + LLAMA_V3_API int llama_v3_n_ctx (const struct llama_v3_context * ctx); + LLAMA_V3_API int llama_v3_n_embd (const struct llama_v3_context * ctx); + + LLAMA_V3_API int llama_v3_n_vocab_from_model(const struct llama_v3_model * model); + LLAMA_V3_API int llama_v3_n_ctx_from_model (const struct llama_v3_model * model); + LLAMA_V3_API int llama_v3_n_embd_from_model (const struct llama_v3_model * model); + + LLAMA_V3_API int llama_v3_model_type(const struct llama_v3_model * model, char * buf, size_t buf_size); + + // Get the vocabulary as output parameters. + // Returns number of results. 
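+    // For example (an illustrative sketch only):
+    //
+    //   int capacity = llama_v3_n_vocab(ctx);
+    //   std::vector<const char *> strings(capacity);
+    //   std::vector<float>        scores(capacity);
+    //   int n = llama_v3_get_vocab(ctx, strings.data(), scores.data(), capacity);
+    //   // strings[i] and scores[i] are valid for i < n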
+ LLAMA_V3_API int llama_v3_get_vocab( + const struct llama_v3_context * ctx, + const char * * strings, + float * scores, + int capacity); + + LLAMA_V3_API int llama_v3_get_vocab_from_model( + const struct llama_v3_model * model, + const char * * strings, + float * scores, + int capacity); + + // Token logits obtained from the last call to llama_v3_eval() + // The logits for the last token are stored in the last row + // Can be mutated in order to change the probabilities of the next token + // Rows: n_tokens + // Cols: n_vocab + LLAMA_V3_API float * llama_v3_get_logits(struct llama_v3_context * ctx); + + // Get the embeddings for the input + // shape: [n_embd] (1-dimensional) + LLAMA_V3_API float * llama_v3_get_embeddings(struct llama_v3_context * ctx); + + // Token Id -> String. Uses the vocabulary in the provided context + LLAMA_V3_API const char * llama_v3_token_to_str( + const struct llama_v3_context * ctx, + llama_v3_token token); + + LLAMA_V3_API const char * llama_v3_token_to_str_with_model( + const struct llama_v3_model * model, + llama_v3_token token); + + // Special tokens + LLAMA_V3_API llama_v3_token llama_v3_token_bos(); // beginning-of-sentence + LLAMA_V3_API llama_v3_token llama_v3_token_eos(); // end-of-sentence + LLAMA_V3_API llama_v3_token llama_v3_token_nl(); // next-line + + // Grammar + // + LLAMA_V3_API struct llama_v3_grammar * llama_v3_grammar_init( + const llama_v3_grammar_element ** rules, + size_t n_rules, + size_t start_rule_index); + + LLAMA_V3_API void llama_v3_grammar_free(struct llama_v3_grammar * grammar); + + // Sampling functions + + /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. + LLAMA_V3_API void llama_v3_sample_repetition_penalty(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, const llama_v3_token * last_tokens, size_t last_tokens_size, float penalty); + + /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. + LLAMA_V3_API void llama_v3_sample_frequency_and_presence_penalties(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, const llama_v3_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); + + /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 + /// @param candidates A vector of `llama_v3_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. + /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. + /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. + LLAMA_V3_API void llama_v3_sample_classifier_free_guidance( + struct llama_v3_context * ctx, + llama_v3_token_data_array * candidates, + struct llama_v3_context * guidance_ctx, + float scale); + + /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
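+    // A typical chain (an illustrative sketch) applies the penalty functions
+    // above and then, for example:
+    //
+    //   llama_v3_sample_top_k      (ctx, &candidates, 40, 1);
+    //   llama_v3_sample_top_p      (ctx, &candidates, 0.95f, 1);
+    //   llama_v3_sample_temperature(ctx, &candidates, 0.80f);
+    //   llama_v3_token id = llama_v3_sample_token(ctx, &candidates);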
+ LLAMA_V3_API void llama_v3_sample_softmax(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates); + + /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 + LLAMA_V3_API void llama_v3_sample_top_k(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, int k, size_t min_keep); + + /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 + LLAMA_V3_API void llama_v3_sample_top_p(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, float p, size_t min_keep); + + /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. + LLAMA_V3_API void llama_v3_sample_tail_free(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, float z, size_t min_keep); + + /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. + LLAMA_V3_API void llama_v3_sample_typical(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, float p, size_t min_keep); + LLAMA_V3_API void llama_v3_sample_temperature(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, float temp); + + /// @details Apply constraints from grammar + LLAMA_V3_API void llama_v3_sample_grammar(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, const struct llama_v3_grammar * grammar); + + /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. + /// @param candidates A vector of `llama_v3_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. + /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. + /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. + LLAMA_V3_API llama_v3_token llama_v3_sample_token_mirostat(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, float tau, float eta, int m, float * mu); + + /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. + /// @param candidates A vector of `llama_v3_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. + /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. 
A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. + LLAMA_V3_API llama_v3_token llama_v3_sample_token_mirostat_v2(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates, float tau, float eta, float * mu); + + /// @details Selects the token with the highest probability. + LLAMA_V3_API llama_v3_token llama_v3_sample_token_greedy(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates); + + /// @details Randomly selects a token from the candidates based on their probabilities. + LLAMA_V3_API llama_v3_token llama_v3_sample_token(struct llama_v3_context * ctx, llama_v3_token_data_array * candidates); + + /// @details Accepts the sampled token into the grammar + LLAMA_V3_API void llama_v3_grammar_accept_token(struct llama_v3_context * ctx, struct llama_v3_grammar * grammar, llama_v3_token token); + + // Performance information + LLAMA_V3_API struct llama_v3_timings llama_v3_get_timings(struct llama_v3_context * ctx); + LLAMA_V3_API void llama_v3_print_timings(struct llama_v3_context * ctx); + LLAMA_V3_API void llama_v3_reset_timings(struct llama_v3_context * ctx); + + // Print system information + LLAMA_V3_API const char * llama_v3_print_system_info(void); + +#ifdef __cplusplus +} +#endif + +// Internal API to be implemented by llama.cpp and used by tests/benchmarks only +#ifdef LLAMA_V3_API_INTERNAL + +#include +#include +struct ggml_tensor; + +const std::vector>& llama_v3_internal_get_tensor_map(struct llama_v3_context * ctx); + +#endif + +#endif // LLAMA_V3_H diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index 6d2b46568176ae76d4df0e2a26c88a904e077f5e..57ed90888fb5309b861c5ebe917c6bd1dfc667c3 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -301,7 +301,11 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo const auto & hparams = model.hparams; size_t vram_total = 0; const int n_gpu = std::min(gpulayers, int(hparams.n_layers)); - fprintf(stderr, "%s: [GPU] offloading %d layers to GPU\n", __func__, n_gpu); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); + #else + fprintf(stderr, "%s: [CUDA] offloading %d layers to GPU\n", __func__, n_gpu); + #endif for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; layer.ffn_up_proj->backend = GGML_BACKEND_GPU; @@ -320,7 +324,11 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo ggml_cuda_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_nbytes(layer.c_attn_out_proj_weight); #endif } - fprintf(stderr, "%s: [GPU] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #else + fprintf(stderr, "%s: [CUDA] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #endif } #endif @@ -399,7 +407,7 @@ bool 
mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // a = self.ln_1(x) { - cur = ggml_norm(ctx0, inpL); + cur = ggml_norm(ctx0, inpL, default_norm_eps); cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur); } @@ -497,7 +505,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // m = self.ln_2(x) { - cur = ggml_norm(ctx0, inpL); + cur = ggml_norm(ctx0, inpL, default_norm_eps); cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur); } @@ -525,7 +533,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // norm { - inpL = ggml_norm(ctx0, inpL); + inpL = ggml_norm(ctx0, inpL, default_norm_eps); // inpL = ln_f_g*inpL inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL); } diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 61bc9230e90eb40f11aa3cb5e1c195b78b9447d2..d9fb93b2863731b9e25773d5070e9357c7fdaf64 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -335,7 +335,11 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & const auto & hparams = model.hparams; size_t vram_total = 0; const int n_gpu = std::min(gpulayers, int(hparams.n_layer)); - fprintf(stderr, "%s: [GPU] offloading %d layers to GPU\n", __func__, n_gpu); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); + #else + fprintf(stderr, "%s: [CUDA] offloading %d layers to GPU\n", __func__, n_gpu); + #endif for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; layer.c_attn_attn_w->backend = GGML_BACKEND_GPU; @@ -354,7 +358,11 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & ggml_cuda_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); #endif } - fprintf(stderr, "%s: [GPU] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #if defined(GGML_USE_CLBLAST) + fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #else + fprintf(stderr, "%s: [CUDA] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + #endif } #endif @@ -367,7 +375,7 @@ ggml_tensor * gpt_neox_ff( const gpt_neox_layer &layer, ggml_context * ctx0, ggml_tensor * inp) { - ggml_tensor * cur = ggml_norm(ctx0, inp); + ggml_tensor * cur = ggml_norm(ctx0, inp, default_norm_eps); cur = ggml_add(ctx0, ggml_mul(ctx0, @@ -481,7 +489,7 @@ bool gpt_neox_eval( // self-attention { { - cur = ggml_norm(ctx0, inpL); + cur = ggml_norm(ctx0, inpL, default_norm_eps); cur = ggml_add(ctx0, ggml_mul(ctx0, @@ -613,7 +621,7 @@ bool gpt_neox_eval( // norm { - inpL = ggml_norm(ctx0, inpL); + inpL = ggml_norm(ctx0, inpL, default_norm_eps); // inpL = ln_f_g*inpL + ln_f_b inpL = ggml_add(ctx0, @@ -663,4 +671,4 @@ bool gpt_neox_eval( ggml_free(ctx0); return true; -} \ No newline at end of file +} diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h index a1ec3bf634bdf30a35bc4abe2b344034c078c0ad..f4a39b12bbdb65ffbba03b2c1616e34fcd9ea7fb 100644 --- a/otherarch/otherarch.h +++ b/otherarch/otherarch.h @@ -458,4 +458,4 @@ struct mpt_model { std::map tensors; }; - +const float default_norm_eps = 1e-5f; diff --git a/otherarch/rwkv_v2.cpp b/otherarch/rwkv_v2.cpp index d627a13f0da34034f950be24ffa83f5ccddb6d14..7d2065eaa3b29205c4e0bef2cb44f8b6ff800903 100644 --- a/otherarch/rwkv_v2.cpp +++ b/otherarch/rwkv_v2.cpp @@ -367,8 +367,8 @@ struct rwkv_v2_context * 
rwkv_v2_init_from_file(const char * file_path, uint32_t // Verify order of dimensions struct ggml_v2_tensor * emb = model->emb; RWKV_V2_ASSERT_NULL(emb->n_dims == 2, "Unexpected dimension count of embedding matrix %d", emb->n_dims); - RWKV_V2_ASSERT_NULL(emb->ne[0] == model->n_embed, "Unexpected dimension of embedding matrix %lld", emb->ne[0]); - RWKV_V2_ASSERT_NULL(emb->ne[1] == model->n_vocab, "Unexpected dimension of embedding matrix %lld", emb->ne[1]); + RWKV_V2_ASSERT_NULL(emb->ne[0] == model->n_embed, "Unexpected dimension of embedding matrix %ld", emb->ne[0]); + RWKV_V2_ASSERT_NULL(emb->ne[1] == model->n_vocab, "Unexpected dimension of embedding matrix %ld", emb->ne[1]); int32_t n_embed = model->n_embed; int32_t n_layer = model->n_layer; diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp index 3bdf221fdfb811af9a48aaf0068fa56d352d010c..cbabab5a87cea678aa03a28f992607e87cae052f 100644 --- a/otherarch/rwkv_v3.cpp +++ b/otherarch/rwkv_v3.cpp @@ -304,6 +304,14 @@ struct rwkv_tensor { uint8_t * data; }; +//rwkv relied on the old ggml_nbytes implementation, so backport it here. Fixes breaking change in PR 2874 +size_t rwkv_nbytes_old(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + auto a = tensor->ne[3]*tensor->nb[3]; + auto b = (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type); + return ((a) > (b) ? (a) : (b)); +} + bool rwkv_fread_tensor_header(FILE * file, struct rwkv_tensor_header & header) { RWKV_ASSERT_FALSE(RWKV_ERROR_FILE_READ, rwkv_fread_data(file, sizeof(struct rwkv_tensor_header) - sizeof(uint32_t), &header)); header.height = 1; @@ -371,7 +379,7 @@ bool rwkv_fread_ggml_tensor_data(FILE * file, const struct rwkv_tensor_header & RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, tensor, "Failed to allocate tensor"); ggml_set_name(tensor, name.c_str()); - RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE_READ, rwkv_fread_data(file, ggml_nbytes(tensor), tensor->data), "Failed to read tensor data from %s", name.c_str()); + RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE_READ, rwkv_fread_data(file, rwkv_nbytes_old(tensor), tensor->data), "Failed to read tensor data from %s", name.c_str()); return true; } @@ -477,7 +485,7 @@ struct ggml_tensor * rwkv_max(ggml_context * ctx, struct ggml_tensor * x, struct struct ggml_tensor * rwkv_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight, struct ggml_tensor * bias) { // LayerNorm in RWKV is `x = (x - mean(x)) / sqrt(variance(x) + 1e-5) * weight + bias` // Looks like ggml_norm does the first part, we only need to apply weight & bias. 
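// Spelled out with the new explicit-eps API (a reading sketch; `weight` and `bias` are the per-channel LayerNorm parameters):
//   x = ggml_norm(ctx, x, default_norm_eps);            // (x - mean(x)) / sqrt(variance(x) + eps)
//   x = ggml_add(ctx, ggml_mul(ctx, x, weight), bias);  // then scale and shift
// The _inplace variants used below compute the same result while reusing the input buffers.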
- return ggml_add_inplace(ctx, ggml_mul_inplace(ctx, ggml_norm(ctx, x), weight), bias); + return ggml_add_inplace(ctx, ggml_mul_inplace(ctx, ggml_norm(ctx, x, default_norm_eps), weight), bias); } // --- Implementation --- @@ -539,7 +547,7 @@ struct rwkv_future_tensor { decoy.ne[1] = height; decoy.ne[2] = 1; decoy.ne[3] = 1; - return ggml_nbytes(&decoy); + return rwkv_nbytes_old(&decoy); } rwkv_future_tensor() {} @@ -1595,7 +1603,7 @@ bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) void rwkv_set_inputs(const struct rwkv_context * ctx, const float * state_in) { if (state_in) { - memcpy(ctx->input_state->data, state_in, ggml_nbytes(ctx->input_state)); + memcpy(ctx->input_state->data, state_in, rwkv_nbytes_old(ctx->input_state)); } else { rwkv_init_state(ctx, (float *) ctx->input_state->data); } @@ -1603,11 +1611,11 @@ void rwkv_set_inputs(const struct rwkv_context * ctx, const float * state_in) { void rwkv_get_outputs(const struct rwkv_context * ctx, float * state_out, float * logits_out) { if (state_out) { - memcpy(state_out, ctx->output_state->data, ggml_nbytes(ctx->output_state)); + memcpy(state_out, ctx->output_state->data, rwkv_nbytes_old(ctx->output_state)); } if (logits_out) { - memcpy(logits_out, ctx->logits->data, ggml_nbytes(ctx->logits)); + memcpy(logits_out, ctx->logits->data, rwkv_nbytes_old(ctx->logits)); } } diff --git a/prompts/chat-with-baichuan.txt b/prompts/chat-with-baichuan.txt new file mode 100644 index 0000000000000000000000000000000000000000..11626b692531fde9ba6cb5eaa652f80a8328e11a --- /dev/null +++ b/prompts/chat-with-baichuan.txt @@ -0,0 +1,4 @@ +以下内容为人类用户与与一位智能助手的对话。 + +用户:你好! +助手: diff --git a/requirements.txt b/requirements.txt index 6c32cbd047b84aadd0eb440ffeca4c771990befd..1f831d56b4510578ef6b103bf3e3a995d76ac442 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ numpy==1.24 sentencepiece==0.1.98 +gguf>=0.1.0 +customtkinter>=5.1.0 \ No newline at end of file diff --git a/run_with_preset.py b/run_with_preset.py new file mode 100644 index 0000000000000000000000000000000000000000..9b4d7ecbe82d4381b08d696263030092395f10be --- /dev/null +++ b/run_with_preset.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 + +import argparse +import os +import subprocess +import sys + +import yaml + +CLI_ARGS_MAIN_PERPLEXITY = [ + "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape", + "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag", + "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct", + "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base", + "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock", + "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q", + "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt", + "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n", + "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed", + "simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical", + "verbose-prompt" +] + +CLI_ARGS_LLAMA_BENCH = [ + "batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers", + "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose" +] + +CLI_ARGS_SERVER = [ + "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base", 
+ "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q", + "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split", + "threads", "verbose" +] + +description = """Run llama.cpp binaries with presets from YAML file(s). +To specify which binary should be run, specify the "binary" property (main, perplexity, llama-bench, and server are supported). +To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument. + +Formatting considerations: +- The YAML property names are the same as the CLI argument names of the corresponding binary. +- Properties must use the long name of their corresponding llama.cpp CLI arguments. +- Like the llama.cpp binaries the property names do not differentiate between hyphens and underscores. +- Flags must be defined as ": true" to be effective. +- To define the logit_bias property, the expected format is ": " in the "logit_bias" namespace. +- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings. +- To define a tensor split, pass a list of floats. +""" +usage = "run_with_preset.py [-h] [yaml_files ...] [-- ...]" +epilog = (" -- specify additional CLI ars to be passed to the binary (override all preset files). " + "Unknown args will be ignored.") + +parser = argparse.ArgumentParser( + description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter) +parser.add_argument("-bin", "--binary", help="The binary to run.") +parser.add_argument("yaml_files", nargs="*", + help="Arbitrary number of YAML files from which to read preset values. " + "If two files specify the same values the later one will be used.") + +known_args, unknown_args = parser.parse_known_args() + +if not known_args.yaml_files and not unknown_args: + parser.print_help() + sys.exit(0) + +props = dict() + +for yaml_file in known_args.yaml_files: + with open(yaml_file, "r") as f: + props.update(yaml.load(f, yaml.SafeLoader)) + +props = {prop.replace("_", "-"): val for prop, val in props.items()} + +binary = props.pop("binary", "main") +if known_args.binary: + binary = known_args.binary + +if os.path.exists(f"./{binary}"): + binary = f"./{binary}" + +if binary.lower().endswith("main") or binary.lower().endswith("perplexity"): + cli_args = CLI_ARGS_MAIN_PERPLEXITY +elif binary.lower().endswith("llama-bench"): + cli_args = CLI_ARGS_LLAMA_BENCH +elif binary.lower().endswith("server"): + cli_args = CLI_ARGS_SERVER +else: + print(f"Unknown binary: {binary}") + sys.exit(1) + +command_list = [binary] + +for cli_arg in cli_args: + value = props.pop(cli_arg, None) + + if not value or value == -1: + continue + + if cli_arg == "logit-bias": + for token, bias in value.items(): + command_list.append("--logit-bias") + command_list.append(f"{token}{bias:+}") + continue + + if cli_arg == "reverse-prompt" and not isinstance(value, str): + for rp in value: + command_list.append("--reverse-prompt") + command_list.append(str(rp)) + continue + + command_list.append(f"--{cli_arg}") + + if cli_arg == "tensor-split": + command_list.append(",".join([str(v) for v in value])) + continue + + value = str(value) + + if value != "True": + command_list.append(str(value)) + +num_unused = len(props) +if num_unused > 10: + print(f"The preset file contained a total of {num_unused} unused properties.") +elif num_unused > 0: + print("The preset file contained the following unused properties:") + for prop, value in props.items(): + print(f" {prop}: {value}") + 
+command_list += unknown_args + +sp = subprocess.Popen(command_list) + +while sp.returncode is None: + try: + sp.wait() + except KeyboardInterrupt: + pass + +sys.exit(sp.returncode) diff --git a/scripts/LlamaConfig.cmake.in b/scripts/LlamaConfig.cmake.in new file mode 100644 index 0000000000000000000000000000000000000000..e1fadc361080272e8e8c92ec3fbb89d7d4cf9f2c --- /dev/null +++ b/scripts/LlamaConfig.cmake.in @@ -0,0 +1,69 @@ +set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@) +set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) +set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) +set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) +set(LLAMA_BLAS @LLAMA_BLAS@) +set(LLAMA_CUBLAS @LLAMA_CUBLAS@) +set(LLAMA_METAL @LLAMA_METAL@) +set(LLAMA_MPI @LLAMA_MPI@) +set(LLAMA_CLBLAST @LLAMA_CLBLAST@) +set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@) +set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@) + +@PACKAGE_INIT@ + +set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@") +set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@") +set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@") + +# Ensure transient dependencies satisfied + +find_package(Threads REQUIRED) +if (APPLE AND LLAMA_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED) +endif() + +if (LLAMA_BLAS) + find_package(BLAS REQUIRED) +endif() + +if (LLAMA_CUBLAS) + find_package(CUDAToolkit REQUIRED) +endif() + +if (LLAMA_METAL) + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) +endif() + +if (LLAMA_MPI) + find_package(MPI REQUIRED) +endif() + +if (LLAMA_CLBLAST) + find_package(CLBlast REQUIRED) +endif() + +if (LLAMA_HIPBLAS) + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) +endif() + +find_library(llama_LIBRARY llama + REQUIRED + HINTS ${LLAMA_LIB_DIR}) + +set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@") +add_library(llama UNKNOWN IMPORTED) +set_target_properties(llama + PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}" + INTERFACE_LINK_LIBRARIES "${_llama_link_deps}" + IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" + IMPORTED_LOCATION "${llama_LIBRARY}" + INTERFACE_COMPILE_FEATURES cxx_std_11 + POSITION_INDEPENDENT_CODE ON ) + +check_required_components(Llama) diff --git a/scripts/convert-gg.sh b/scripts/convert-gg.sh new file mode 100644 index 0000000000000000000000000000000000000000..01fda16fd7efc18066e23ae31c748fbedcf2ab6b --- /dev/null +++ b/scripts/convert-gg.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -e + +# LLaMA v1 +python3 convert.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16 + +# LLaMA v2 +python3 convert.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16 + +# Code Llama +python3 convert.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16 +python3 convert.py 
../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16 + +# Falcon +python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b 1 +mv -v ../falcon/falcon-7b/ggml-model-f16.gguf models/falcon-7b/ggml-model-f16.gguf + +python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-40b 1 +mv -v ../falcon/falcon-40b/ggml-model-f16.gguf models/falcon-40b/ggml-model-f16.gguf diff --git a/scripts/get-wikitext-2.sh b/scripts/get-wikitext-2.sh new file mode 100644 index 0000000000000000000000000000000000000000..98aec3e3ea503fefae1793d67281cd6f97bf7827 --- /dev/null +++ b/scripts/get-wikitext-2.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip diff --git a/scripts/qnt-all.sh b/scripts/qnt-all.sh new file mode 100644 index 0000000000000000000000000000000000000000..b4c2a159e2bf510c96f8e92dd6a4fac7ea07bdaa --- /dev/null +++ b/scripts/qnt-all.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +qnt=(q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k) +args="" + +if [ -z "$1" ]; then + echo "usage: $0 [qnt] [args]" + echo "default: $0 \"${qnt[@]}\" \"${args}\"" + exit 1 +fi + +if [ ! -z "$2" ]; then + qnt=($2) +fi + +if [ ! -z "$3" ]; then + args="$3" +fi + +model="$1" +out="../tmp/results-${model}" + +set -o pipefail +set -e + +mkdir -p ${out} + +for q in ${qnt[@]}; do + time ./bin/quantize ../models/${model}/ggml-model-f16.gguf ../models/${model}/ggml-model-${q}.gguf ${q} 2>&1 ${args} | tee ${out}/qnt-${q}.txt +done diff --git a/scripts/run-all-perf.sh b/scripts/run-all-perf.sh new file mode 100644 index 0000000000000000000000000000000000000000..6384e364d558439ad5a7e46ba2f0d7188615c66a --- /dev/null +++ b/scripts/run-all-perf.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +qnt=(f16 q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k) +args="-ngl 999 -n 64 -p 512" + +if [ -z "$1" ]; then + echo "usage: $0 [qnt] [args]" + echo "default: $0 \"${qnt[@]}\" \"${args}\"" + exit 1 +fi + +if [ ! -z "$2" ]; then + qnt=($2) +fi + +if [ ! -z "$3" ]; then + args="$3" +fi + +model="$1" +out="../tmp/results-${model}" + +set -o pipefail +set -e + +mkdir -p ${out} + +mstr="" + +for q in ${qnt[@]}; do + mstr="${mstr} -m ../models/${model}/ggml-model-${q}.gguf" +done + +./bin/llama-bench ${mstr} ${args} 2> /dev/null diff --git a/scripts/run-all-ppl.sh b/scripts/run-all-ppl.sh new file mode 100644 index 0000000000000000000000000000000000000000..e04d61d7fe0910c60e7f8a05861c0365bad67e84 --- /dev/null +++ b/scripts/run-all-ppl.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +qnt=(f16 q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k) +args="-ngl 999 -t 8" + +if [ -z "$1" ]; then + echo "usage: $0 [qnt] [args]" + echo "default: $0 \"${qnt[@]}\" \"${args}\"" + exit 1 +fi + +if [ ! -z "$2" ]; then + qnt=($2) +fi + +if [ ! -z "$3" ]; then + args="$3" +fi + +set -o pipefail +set -e + +model="$1" +out="../tmp/results-${model}" + +mkdir -p ${out} + +for q in ${qnt[@]}; do + time ./bin/perplexity -m ../models/${model}/ggml-model-${q}.gguf -f ./wiki.test.raw ${args} 2>&1 | tee ${out}/ppl-${q}.txt +done diff --git a/spm-headers/ggml.h b/spm-headers/ggml.h index bdbd12800433242dec1e6dac8c96875540dd19e7..f45456876da621d5beafdc2419d9d575a9fb02ff 100644 --- a/spm-headers/ggml.h +++ b/spm-headers/ggml.h @@ -130,13 +130,16 @@ // The data of the tensor is accessed via the "data" pointer.
For example: // // { -// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); +// const int nx = 2; +// const int ny = 3; // -// // a[2, 1] = 1.0f; -// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; +// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny); // -// // a[0, 2] = 2.0f; -// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; +// for (int y = 0; y < ny; y++) { +// for (int x = 0; x < nx; x++) { +// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y; +// } +// } // // ... // } @@ -192,6 +195,14 @@ # define GGML_DEPRECATED(func, hint) func #endif +#ifndef __GNUC__ +# define GGML_ATTRIBUTE_FORMAT(...) +#elif defined(__MINGW32__) +# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif + #include <stdint.h> #include <stddef.h> #include <stdbool.h> @@ -207,14 +218,24 @@ #define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 -#define GGML_MAX_NAME 48 +#define GGML_MAX_NAME 64 #define GGML_MAX_OP_PARAMS 32 #define GGML_DEFAULT_N_THREADS 4 +#if UINTPTR_MAX == 0xFFFFFFFF + #define GGML_MEM_ALIGN 4 +#else + #define GGML_MEM_ALIGN 16 +#endif #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 +#define GGUF_MAGIC 0x46554747 // "GGUF" +#define GGUF_VERSION 2 + +#define GGUF_DEFAULT_ALIGNMENT 32 + #define GGML_UNUSED(x) (void)(x) #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) @@ -255,8 +276,9 @@ extern "C" { #endif -#ifdef __ARM_NEON - // we use the built-in 16-bit float type +#if defined(__ARM_NEON) && defined(__CUDACC__) + typedef half ggml_fp16_t; +#elif defined(__ARM_NEON) typedef __fp16 ggml_fp16_t; #else typedef uint16_t ggml_fp16_t; @@ -340,10 +362,12 @@ extern "C" { GGML_OP_ARGMAX, GGML_OP_REPEAT, GGML_OP_REPEAT_BACK, + GGML_OP_CONCAT, GGML_OP_SILU_BACK, GGML_OP_NORM, // normalize GGML_OP_RMS_NORM, GGML_OP_RMS_NORM_BACK, + GGML_OP_GROUP_NORM, GGML_OP_MUL_MAT, GGML_OP_OUT_PROD, @@ -369,14 +393,19 @@ extern "C" { GGML_OP_CLAMP, GGML_OP_CONV_1D, GGML_OP_CONV_2D, + GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, + GGML_OP_UPSCALE, // nearest interpolate + GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, GGML_OP_FLASH_ATTN_BACK, GGML_OP_WIN_PART, GGML_OP_WIN_UNPART, + GGML_OP_GET_REL_POS, + GGML_OP_ADD_REL_POS, GGML_OP_UNARY, @@ -458,6 +487,9 @@ extern "C" { int64_t perf_cycles; int64_t perf_time_us; + struct ggml_tensor * view_src; + size_t view_offs; + void * data; char name[GGML_MAX_NAME]; @@ -562,6 +594,7 @@ extern "C" { GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split); GGML_API int ggml_blck_size (enum ggml_type type); @@ -639,7 +672,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); - GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const
char * name); @@ -660,6 +693,7 @@ extern "C" { GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); + GGML_ATTRIBUTE_FORMAT(2, 3) GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...); // @@ -799,6 +833,13 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // concat a and b on dim 2 + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_concat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_abs( struct ggml_context * ctx, struct ggml_tensor * a); @@ -888,14 +929,15 @@ extern "C" { struct ggml_tensor * b); // normalize along rows - // TODO: eps is hardcoded to 1e-5 for now GGML_API struct ggml_tensor * ggml_norm( struct ggml_context * ctx, - struct ggml_tensor * a); + struct ggml_tensor * a, + float eps); GGML_API struct ggml_tensor * ggml_norm_inplace( struct ggml_context * ctx, - struct ggml_tensor * a); + struct ggml_tensor * a, + float eps); GGML_API struct ggml_tensor * ggml_rms_norm( struct ggml_context * ctx, @@ -907,13 +949,26 @@ extern "C" { struct ggml_tensor * a, float eps); + // group normalize along ne0*ne1*n_groups + // used in stable-diffusion + // TODO: eps is hardcoded to 1e-6 for now + GGML_API struct ggml_tensor * ggml_group_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups); + + GGML_API struct ggml_tensor * ggml_group_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups); + // a - x // b - dy - // TODO: update with configurable eps GGML_API struct ggml_tensor * ggml_rms_norm_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + float eps); // A: n columns, m rows // B: n columns, p rows (i.e. 
we transpose it internally) @@ -1207,6 +1262,15 @@ extern "C" { float freq_base, float freq_scale); + // xPos RoPE, in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + float base, + bool down); + // rotary position embedding backward, i.e compute dx from dy // a - dy GGML_API struct ggml_tensor * ggml_rope_back( @@ -1215,7 +1279,11 @@ extern "C" { int n_past, int n_dims, int mode, - int n_ctx); + int n_ctx, + float freq_base, + float freq_scale, + float xpos_base, + bool xpos_down); // alibi position embedding // in-place, returns view(a) @@ -1242,6 +1310,15 @@ extern "C" { int p0, // padding int d0); // dilation + // conv_1d with padding = half + // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) + GGML_API struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d); + GGML_API struct ggml_tensor * ggml_conv_2d( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1253,14 +1330,38 @@ extern "C" { int d0, int d1); - // conv_1d with padding = half - // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) - GGML_API struct ggml_tensor * ggml_conv_1d_ph( + + // kernel size is a->ne[0] x a->ne[1] + // stride is equal to kernel size + // padding is zero + // example: + // a: 16 16 3 768 + // b: 1024 1024 3 1 + // res: 64 64 768 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // kernel size is a->ne[0] x a->ne[1] + // stride is 1 + // padding is half + // example: + // a: 3 3 256 256 + // b: 64 64 256 1 + // res: 64 64 256 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - int s, - int d); + int stride); enum ggml_op_pool { GGML_OP_POOL_MAX, @@ -1287,6 +1388,13 @@ extern "C" { int p0, int p1); + // nearest interpolate + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_upscale( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor); + GGML_API struct ggml_tensor * ggml_flash_attn( struct ggml_context * ctx, struct ggml_tensor * q, @@ -1340,6 +1448,27 @@ extern "C" { struct ggml_tensor * a, enum ggml_unary_op op); + // used in sam + GGML_API struct ggml_tensor * ggml_get_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + int qh, + int kh); + + // used in sam + + GGML_API struct ggml_tensor * ggml_add_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + + GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + // custom operators typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); @@ -1495,7 +1624,8 @@ extern "C" { struct ggml_tensor * tensor); - GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); GGML_API struct ggml_cgraph ggml_build_forward (struct 
ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); @@ -1560,6 +1690,8 @@ extern "C" { GGML_LINESEARCH_INVALID_PARAMETERS, }; + typedef void (*ggml_opt_callback)(void * data, float * sched); + // optimization parameters // // see ggml.c (ggml_opt_default_params) for default values @@ -1595,12 +1727,14 @@ extern "C" { float sched; // schedule multiplier (fixed, decay or warmup) float decay; // weight decay for AdamW, use 0.0f to disable + int decay_min_ndim; // minimum number of tensor dimension to apply weight decay float alpha; // learning rate float beta1; float beta2; float eps; // epsilon for numerical stability float eps_f; // epsilon for convergence test float eps_g; // epsilon for convergence test + float gclip; // gradient clipping } adam; // LBFGS parameters @@ -1628,14 +1762,12 @@ extern "C" { bool just_initialized; + float loss_before; + float loss_after; + struct { - struct ggml_tensor * x; // view of the parameters - struct ggml_tensor * g1; // gradient - struct ggml_tensor * g2; // gradient squared struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment - struct ggml_tensor * mh; // first moment hat - struct ggml_tensor * vh; // second moment hat struct ggml_tensor * pf; // past function values float fx_best; float fx_prev; @@ -1672,10 +1804,10 @@ extern "C" { // initialize optimizer context GGML_API void ggml_opt_init( - struct ggml_context * ctx, + struct ggml_context * ctx, struct ggml_opt_context * opt, - struct ggml_opt_params params, - int64_t nx); + struct ggml_opt_params params, + int64_t nx); // continue optimizing the function defined by the tensor f GGML_API enum ggml_opt_result ggml_opt_resume( @@ -1689,7 +1821,9 @@ extern "C" { struct ggml_opt_context * opt, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb); + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data); // // quantization @@ -1703,6 +1837,127 @@ extern "C" { GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + // + // gguf + // + + enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, + GGUF_TYPE_UINT64 = 10, + GGUF_TYPE_INT64 = 11, + GGUF_TYPE_FLOAT64 = 12, + GGUF_TYPE_COUNT, // marks the end of the enum + }; + + struct gguf_context; + + struct gguf_init_params { + bool no_alloc; + + // if not NULL, create a ggml_context and allocate the tensor data in it + struct ggml_context ** ctx; + }; + + GGML_API struct gguf_context * gguf_init_empty(void); + GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); + //GGML_API struct gguf_context * gguf_init_from_buffer(..); + + GGML_API void gguf_free(struct gguf_context * ctx); + + GGML_API const char * gguf_type_name(enum gguf_type type); + + GGML_API int gguf_get_version (const struct gguf_context * ctx); + GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); + GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); + GGML_API void * gguf_get_data (const struct gguf_context * ctx); + + GGML_API int gguf_get_n_kv(const struct gguf_context * ctx); + GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key); + GGML_API const char * gguf_get_key 
(const struct gguf_context * ctx, int i); + + GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i); + GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i); + + // results are undefined if the wrong type is used for the key + GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i); + GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i); + GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i); + GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i); + GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i); + GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i); + GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i); + GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i); + GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i); + GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i); + GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i); + GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i); + GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i); + GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i); + GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); + + GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); + GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); + GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); + GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); + + // overrides existing values or adds a new one + GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); + GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); + GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val); + GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val); + GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); + GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); + GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); + GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); + GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); + GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); + GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); + GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); + GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n); + GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n); + + // set or add KV pairs from another context + GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src); + + // manage tensor info + GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); + GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type); + GGML_API 
void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size); + + // writing gguf files can be done in 2 ways: + // + // - write the entire gguf_context to a binary file in a single pass: + // + // gguf_write_to_file(ctx, fname); + // + // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: + // + // FILE * f = fopen(fname, "wb"); + // fseek(f, gguf_get_meta_size(ctx), SEEK_SET); + // fwrite(tensor_data, ...); + // void * data = malloc(gguf_get_meta_size(ctx)); + // gguf_get_meta_data(ctx, data); + // fseek(f, 0, SEEK_SET); + // fwrite(data, gguf_get_meta_size(ctx), 1, f); + // free(data); + // fclose(f); + // + + // write the entire context to a binary file + GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); + + // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding + GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); + GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); + // // system info // @@ -1715,6 +1970,7 @@ extern "C" { GGML_API int ggml_cpu_has_fma (void); GGML_API int ggml_cpu_has_neon (void); GGML_API int ggml_cpu_has_arm_fma (void); + GGML_API int ggml_cpu_has_metal (void); GGML_API int ggml_cpu_has_f16c (void); GGML_API int ggml_cpu_has_fp16_va (void); GGML_API int ggml_cpu_has_wasm_simd (void); @@ -1723,6 +1979,7 @@ extern "C" { GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_sse3 (void); + GGML_API int ggml_cpu_has_ssse3 (void); GGML_API int ggml_cpu_has_vsx (void); // @@ -1740,6 +1997,10 @@ extern "C" { typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); typedef struct { + const char * type_name; + int blck_size; + size_t type_size; + bool is_quantized; ggml_to_float_t to_float; ggml_from_float_t from_float; ggml_from_float_t from_float_reference; @@ -1747,7 +2008,7 @@ enum ggml_type vec_dot_type; } ggml_type_traits_t; - ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i); + ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); #ifdef __cplusplus } diff --git a/tests/test-c.c b/tests/test-c.c new file mode 100644 index 0000000000000000000000000000000000000000..a05071080a1dffa4e22ad908371f6b9e2ba8077c --- /dev/null +++ b/tests/test-c.c @@ -0,0 +1,3 @@ +#include "llama.h" + +int main(void) {} diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a0b5b043df868bd3a7c16bd71d205d3a6754da08 --- /dev/null +++ b/tests/test-grammar-parser.cpp @@ -0,0 +1,250 @@ +#ifdef NDEBUG +#undef NDEBUG +#endif + +#include "llama.h" +#include "grammar-parser.h" + +#include <cassert> + +int main() +{ + grammar_parser::parse_state parsed_grammar; + + const char *grammar_bytes = R"""(root ::= (expr "=" term "\n")+ +expr ::= term ([-+*/] term)* +term ::= [0-9]+)"""; + + parsed_grammar = grammar_parser::parse(grammar_bytes); + + std::vector<std::pair<std::string, uint32_t>> expected = { + {"expr", 2}, + {"expr_5", 5}, + {"expr_6", 6}, + {"root", 0}, + {"root_1", 1}, + {"root_4", 4}, + {"term", 3}, + {"term_7", 7}, + }; + + uint32_t index = 0; + for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) + { + std::string key = it->first; + uint32_t value = it->second; + std::pair<std::string, uint32_t> expected_pair = expected[index]; + + // pretty print error message before
asserting + if (expected_pair.first != key || expected_pair.second != value) + { + fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second); + fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value); + fprintf(stderr, "expected_pair != actual_pair\n"); + } + + assert(expected_pair.first == key && expected_pair.second == value); + + index++; + } + std::vector expected_rules = { + {LLAMA_GRETYPE_RULE_REF, 4}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_RULE_REF, 2}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 10}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_RULE_REF, 6}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_RULE_REF, 1}, + {LLAMA_GRETYPE_RULE_REF, 4}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_RULE_REF, 1}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_CHAR, 45}, + {LLAMA_GRETYPE_CHAR_ALT, 43}, + {LLAMA_GRETYPE_CHAR_ALT, 42}, + {LLAMA_GRETYPE_CHAR_ALT, 47}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_RULE_REF, 6}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_CHAR, 48}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_CHAR, 48}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, + {LLAMA_GRETYPE_END, 0}, + }; + + index = 0; + for (auto rule : parsed_grammar.rules) + { + // compare rule to expected rule + for (uint32_t i = 0; i < rule.size(); i++) + { + llama_grammar_element element = rule[i]; + llama_grammar_element expected_element = expected_rules[index]; + + // pretty print error message before asserting + if (expected_element.type != element.type || expected_element.value != element.value) + { + fprintf(stderr, "index: %d\n", index); + fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value); + fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value); + fprintf(stderr, "expected_element != actual_element\n"); + } + + assert(expected_element.type == element.type && expected_element.value == element.value); + index++; + } + } + + const char *longer_grammar_bytes = R"""( + root ::= (expr "=" ws term "\n")+ + expr ::= term ([-+*/] term)* + term ::= ident | num | "(" ws expr ")" ws + ident ::= [a-z] [a-z0-9_]* ws + num ::= [0-9]+ ws + ws ::= [ \t\n]* + )"""; + + parsed_grammar = grammar_parser::parse(longer_grammar_bytes); + + expected = { + {"expr", 2}, + {"expr_6", 6}, + {"expr_7", 7}, + {"ident", 8}, + {"ident_10", 10}, + {"num", 9}, + {"num_11", 11}, + {"root", 0}, + {"root_1", 1}, + {"root_5", 5}, + {"term", 4}, + {"ws", 3}, + {"ws_12", 12}, + }; + + index = 0; + for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) + { + std::string key = it->first; + uint32_t value = it->second; + std::pair expected_pair = expected[index]; + + // pretty print error message before asserting + if (expected_pair.first != key || expected_pair.second != value) + { + fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second); + fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value); + fprintf(stderr, "expected_pair != actual_pair\n"); + } + + assert(expected_pair.first == key && expected_pair.second == value); + + index++; + } + expected_rules = { + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_RULE_REF, 2}, + {LLAMA_GRETYPE_CHAR, 61}, + 
{LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_RULE_REF, 4}, + {LLAMA_GRETYPE_CHAR, 10}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_RULE_REF, 4}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_RULE_REF, 12}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_RULE_REF, 8}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_RULE_REF, 9}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_CHAR, 40}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_RULE_REF, 2}, + {LLAMA_GRETYPE_CHAR, 41}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_RULE_REF, 1}, + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_RULE_REF, 1}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_CHAR, 45}, + {LLAMA_GRETYPE_CHAR_ALT, 43}, + {LLAMA_GRETYPE_CHAR_ALT, 42}, + {LLAMA_GRETYPE_CHAR_ALT, 47}, + {LLAMA_GRETYPE_RULE_REF, 4}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_RULE_REF, 6}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_CHAR, 97}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122}, + {LLAMA_GRETYPE_RULE_REF, 10}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_RULE_REF, 11}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_CHAR, 97}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122}, + {LLAMA_GRETYPE_CHAR_ALT, 48}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, + {LLAMA_GRETYPE_CHAR_ALT, 95}, + {LLAMA_GRETYPE_RULE_REF, 10}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_CHAR, 48}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, + {LLAMA_GRETYPE_RULE_REF, 11}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_CHAR, 48}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, + {LLAMA_GRETYPE_END, 0}, + {LLAMA_GRETYPE_CHAR, 32}, + {LLAMA_GRETYPE_CHAR_ALT, 9}, + {LLAMA_GRETYPE_CHAR_ALT, 10}, + {LLAMA_GRETYPE_RULE_REF, 12}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }; + + index = 0; + for (auto rule : parsed_grammar.rules) + { + // compare rule to expected rule + for (uint32_t i = 0; i < rule.size(); i++) + { + llama_grammar_element element = rule[i]; + llama_grammar_element expected_element = expected_rules[index]; + + // pretty print error message before asserting + if (expected_element.type != element.type || expected_element.value != element.value) + { + fprintf(stderr, "index: %d\n", index); + fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value); + fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value); + fprintf(stderr, "expected_element != actual_element\n"); + } + + assert(expected_element.type == element.type && expected_element.value == element.value); + index++; + } + } + + return 0; +} diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp new file mode 100644 index 0000000000000000000000000000000000000000..73dd33dd286a5c0d3123ed88badd70d1b1f6b975 --- /dev/null +++ b/tests/test-llama-grammar.cpp @@ -0,0 +1,403 @@ +#ifdef NDEBUG +#undef NDEBUG +#endif + +#include "llama.cpp" // TODO: not great +#include "grammar-parser.h" + +#include + +int main() +{ + grammar_parser::parse_state parsed_grammar; + + std::vector> expected = { + {"expr", 2}, + {"expr_6", 6}, + {"expr_7", 7}, + {"ident", 8}, + {"ident_10", 10}, + {"num", 9}, + {"num_11", 11}, + {"root", 0}, + {"root_1", 1}, + {"root_5", 5}, + {"term", 4}, + {"ws", 3}, + {"ws_12", 12}, + }; + + std::vector> expected_rules = { + {{LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_END, 0}}, + { + {LLAMA_GRETYPE_RULE_REF, 2}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 3}, + 
{LLAMA_GRETYPE_RULE_REF, 4}, + {LLAMA_GRETYPE_CHAR, 10}, + {LLAMA_GRETYPE_END, 0}, + }, + {{LLAMA_GRETYPE_RULE_REF, 4}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_END, 0}}, + {{LLAMA_GRETYPE_RULE_REF, 12}, {LLAMA_GRETYPE_END, 0}}, + { + {LLAMA_GRETYPE_RULE_REF, 8}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_RULE_REF, 9}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_CHAR, 40}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_RULE_REF, 2}, + {LLAMA_GRETYPE_CHAR, 41}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_END, 0}, + }, + {{LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_END, 0}}, + { + {LLAMA_GRETYPE_CHAR, 45}, + {LLAMA_GRETYPE_CHAR_ALT, 43}, + {LLAMA_GRETYPE_CHAR_ALT, 42}, + {LLAMA_GRETYPE_CHAR_ALT, 47}, + {LLAMA_GRETYPE_RULE_REF, 4}, + {LLAMA_GRETYPE_END, 0}, + }, + {{LLAMA_GRETYPE_RULE_REF, 6}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}}, + { + {LLAMA_GRETYPE_CHAR, 97}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122}, + {LLAMA_GRETYPE_RULE_REF, 10}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_END, 0}, + }, + {{LLAMA_GRETYPE_RULE_REF, 11}, {LLAMA_GRETYPE_RULE_REF, 3}, {LLAMA_GRETYPE_END, 0}}, + { + {LLAMA_GRETYPE_CHAR, 97}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122}, + {LLAMA_GRETYPE_CHAR_ALT, 48}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, + {LLAMA_GRETYPE_CHAR_ALT, 95}, + {LLAMA_GRETYPE_RULE_REF, 10}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }, + { + {LLAMA_GRETYPE_CHAR, 48}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, + {LLAMA_GRETYPE_RULE_REF, 11}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_CHAR, 48}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, + {LLAMA_GRETYPE_END, 0}, + }, + { + {LLAMA_GRETYPE_CHAR, 32}, + {LLAMA_GRETYPE_CHAR_ALT, 9}, + {LLAMA_GRETYPE_CHAR_ALT, 10}, + {LLAMA_GRETYPE_RULE_REF, 12}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }, + }; + + for (auto pair : expected) + { + parsed_grammar.symbol_ids[pair.first] = pair.second; + } + + for (auto rule : expected_rules) + { + parsed_grammar.rules.push_back({}); + for (auto element : rule) + { + parsed_grammar.rules.back().push_back(element); + } + } + + llama_grammar *grammar = NULL; + std::vector grammar_rules(parsed_grammar.c_rules()); + grammar = llama_grammar_init( + grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + + std::vector> expected_stacks = { + { + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_CHAR, 97}, + }, + { + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_CHAR, 40}, + }, + { + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_CHAR, 97}, + }, + { + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_CHAR, 40}, + }}; + + auto index = 0; + for (auto stack : grammar->stacks) + { + // compare stack to expected_stack + for (uint32_t i = 0; i < 
stack.size(); i++) + { + auto element = stack[i]; + auto expected_element = expected_stacks[index][i]; + + // pretty print error message before asserting + if (expected_element.type != element->type || expected_element.value != element->value) + { + fprintf(stderr, "index: %d\n", index); + fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value); + fprintf(stderr, "actual_element: %d, %d\n", element->type, element->value); + fprintf(stderr, "expected_element != actual_element\n"); + } + + assert(expected_element.type == element->type && expected_element.value == element->value); + } + index++; + } + + std::vector> next_stacks; + std::vector next_candidates; + next_candidates.resize(24); + + for (size_t i = 0; i < 24; ++i) + { + uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point + cp[0] = 37 + i; + cp[1] = 0; + next_candidates[i] = {i, cp, {}}; + } + + std::vector>> expected_reject = { + { + {0, 37}, + {1, 38}, + {2, 39}, + {3, 40}, + {4, 41}, + {5, 42}, + {6, 43}, + {7, 44}, + {8, 45}, + {9, 46}, + {10, 47}, + {11, 48}, + {12, 49}, + {13, 50}, + {14, 51}, + {15, 52}, + {16, 53}, + {17, 54}, + {18, 55}, + {19, 56}, + {20, 57}, + {21, 58}, + {22, 59}, + {23, 60}, + }, + { + {0, 37}, + {1, 38}, + {2, 39}, + {3, 40}, + {4, 41}, + {5, 42}, + {6, 43}, + {7, 44}, + {8, 45}, + {9, 46}, + {10, 47}, + {21, 58}, + {22, 59}, + {23, 60}, + }, + { + {0, 37}, + {1, 38}, + {2, 39}, + {3, 40}, + {4, 41}, + {5, 42}, + {6, 43}, + {7, 44}, + {8, 45}, + {9, 46}, + {10, 47}, + {21, 58}, + {22, 59}, + {23, 60}, + }, + { + {0, 37}, + {1, 38}, + {2, 39}, + {4, 41}, + {5, 42}, + {6, 43}, + {7, 44}, + {8, 45}, + {9, 46}, + {10, 47}, + {11, 48}, + {12, 49}, + {13, 50}, + {14, 51}, + {15, 52}, + {16, 53}, + {17, 54}, + {18, 55}, + {19, 56}, + {20, 57}, + {21, 58}, + {22, 59}, + {23, 60}, + }, + { + {0, 37}, + {1, 38}, + {2, 39}, + {3, 40}, + {4, 41}, + {5, 42}, + {6, 43}, + {7, 44}, + {8, 45}, + {9, 46}, + {10, 47}, + {11, 48}, + {12, 49}, + {13, 50}, + {14, 51}, + {15, 52}, + {16, 53}, + {17, 54}, + {18, 55}, + {19, 56}, + {20, 57}, + {21, 58}, + {22, 59}, + {23, 60}, + }, + { + {0, 37}, + {1, 38}, + {2, 39}, + {3, 40}, + {4, 41}, + {5, 42}, + {6, 43}, + {7, 44}, + {8, 45}, + {9, 46}, + {10, 47}, + {21, 58}, + {22, 59}, + {23, 60}, + }, + { + {0, 37}, + {1, 38}, + {2, 39}, + {3, 40}, + {4, 41}, + {5, 42}, + {6, 43}, + {7, 44}, + {8, 45}, + {9, 46}, + {10, 47}, + {21, 58}, + {22, 59}, + {23, 60}, + }, + { + {0, 37}, + {1, 38}, + {2, 39}, + {4, 41}, + {5, 42}, + {6, 43}, + {7, 44}, + {8, 45}, + {9, 46}, + {10, 47}, + {11, 48}, + {12, 49}, + {13, 50}, + {14, 51}, + {15, 52}, + {16, 53}, + {17, 54}, + {18, 55}, + {19, 56}, + {20, 57}, + {21, 58}, + {22, 59}, + {23, 60}, + }, + }; + + std::vector rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[0], next_candidates); + + std::vector> all_rejects; + + for (std::size_t count = 0; count < grammar->stacks.size(); ++count) + { + rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[count], next_candidates); + all_rejects.push_back(rejects); + } + + index = 0; + for (auto rej : all_rejects) + { + for (uint32_t i = 0; i < rej.size(); i++) + { + auto element = rej[i]; + auto expected_element = expected_reject[index][i]; + assert(element.index == expected_element.first && *element.code_points == expected_element.second); + } + index++; + } + + for (auto &candidate : next_candidates) + { + delete[] candidate.code_points; + candidate.code_points = nullptr; + 
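+        // (each code_points buffer was allocated with new[] when next_candidates was built,
+        //  so it must be released with delete[]; nulling the pointer guards against reuse)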
} + delete grammar; + return 0; +} diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp new file mode 100644 index 0000000000000000000000000000000000000000..836fb8ad271092e9e91dfface487546fe8492ecc --- /dev/null +++ b/tests/test-tokenizer-0-falcon.cpp @@ -0,0 +1,178 @@ +#include "llama.h" +#include "common.h" + +#include +#include +#include +#include +#include + +// generate using test-tokenizer-0-falcon.py +static const std::map> & k_tests() { + static std::map> _k_tests = { + { "" , { }, }, + { " " , { 204, }, }, + { " " , { 258, }, }, + { " " , { 466, }, }, + { "\t" , { 192, }, }, + { "\n" , { 193, }, }, + { "\t\n" , { 19125, }, }, + { "Hello world" , { 9856, 1079, }, }, + { " Hello world" , { 23090, 1079, }, }, + { "Hello World" , { 9856, 2889, }, }, + { " Hello World" , { 23090, 2889, }, }, + { " Hello World!" , { 23090, 2889, 12, }, }, + { "Hello, world!" , { 9856, 23, 1079, 12, }, }, + { " Hello, world!" , { 23090, 23, 1079, 12, }, }, + { " this is 🦙.cpp" , { 414, 304, 3346, 111, 231, 25, 29247, }, }, + { "w048 7tuijk dsdfhu" , { 98, 55866, 204, 34, 16682, 7149, 36190, 6869, 11481, }, }, + { "нещо на Български" , { 150, 133, 6207, 151, 215, 150, 134, 5052, 133, 6279, 5052, 223, 151, 216, 49679, 123, 53110, 47043, 7795, }, }, + { "កាន់តែពិសេសអាចខលចេញ" , { 38154, 206, 38154, 126, 38154, 225, 167, 237, 217, 38154, 221, 167, 237, 208, 38154, 228, 38154, 127, 38154, 237, 167, 237, 207, 38154, 237, 38154, 107, 38154, 126, 38154, 211, 38154, 207, 38154, 233, 38154, 211, 167, 237, 207, 38154, 215, }, }, + { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 2571, 232, 206, 204, 19, 11003, 20, 8196, 126, 283, 219, 48778, 116, 13392, 204, 19, 51831, 732, 63209, 1741, 7955, 522, 20, 22438, 211, 204, 19, 7927, 53360, 325, 504, 701, 946, 10930, 20, }, }, + { "Hello" , { 9856, }, }, + { " Hello" , { 23090, }, }, + { " Hello" , { 204, 23090, }, }, + { " Hello" , { 258, 23090, }, }, + { " Hello" , { 466, 23090, }, }, + { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, + }; + + return _k_tests; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(false); + + // load the vocab + { + auto lparams = llama_context_default_params(); + + lparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); + return 1; + } + } + + if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_BPE) { + fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__); + llama_free_model(model); + llama_free(ctx); + return 2; + } + + bool success = true; + + for (const auto & test_kv : k_tests()) { + const std::vector res = llama_tokenize(ctx, test_kv.first, false); + + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str()); + printf("tok: "); + for (const auto & tok : res) { + printf("%d ", tok); + } + 
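+        // below, the C++ tokenizer output is checked against the reference tokenization
+        // from test-tokenizer-0-falcon.py that is baked into k_tests()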
printf("\n"); + + bool correct = res.size() == test_kv.second.size(); + + for (int i = 0; i < (int) res.size() && correct; ++i) { + if (test_kv.second[i] != res[i]) { + correct = false; + } + } + + if (!correct) { + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + llama_detokenize_bpe(ctx, res).c_str(), + llama_detokenize_bpe(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : expected tokens: ", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + + success = false; + } + } + + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + const std::vector res = llama_tokenize(ctx, text, true); + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + ofs << tok << " "; + } + + ofs << "\n"; + } + + fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + return success ? 
0 : 3; +} diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py new file mode 100644 index 0000000000000000000000000000000000000000..9c8c1c7d1d3ca476192d42eb49ff8c10356f2664 --- /dev/null +++ b/tests/test-tokenizer-0-falcon.py @@ -0,0 +1,83 @@ +# tests with BPE tokenizer + +import os +import sys +import argparse + +from transformers import AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize") +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer + +tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) + +tests = [ + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is 🦙.cpp", + "w048 7tuijk dsdfhu", + "нещо на Български", + "កាន់តែពិសេសអាចខលចេញ", + "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + ] + +for text in tests: + print('text: ', text) + print(tokenizer.encode(text)) + print(tokenizer.decode(tokenizer.encode(text))) + +print("\n\ntests for C++:\n") +for text in tests: + res = tokenizer.encode(text) + + k = text.replace('\n', '\\n') + k = k.replace('\t', '\\t') + k = '"' + k + '"' + print("{ %-24s, { " % k, end='') + for x in res: + print("%7d," % x, end='') + print(" }, },") + +print(tokenizer.encode('hello')) +print(tokenizer.encode('world')) +print(tokenizer.encode(' world')) +print(tokenizer.encode('hello world')) + +fname_tok = args.fname_tok +if fname_tok: + print('tokenizing file: ', fname_tok) + fname_out = fname_tok + '.tok' + with open(fname_tok, 'r') as f: + lines = f.readlines() + s = ''.join(lines) + res = tokenizer.encode(s) + # write to file + with open(fname_out, 'w') as f: + for x in res: + f.write(str(x) + ' ') + f.write('\n') + print('len(res): ', len(res)) + print('len(lines): ', len(lines)) + print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dfb2e81a9bc6ff19368c4aa4a3e5d61dc7056ea7 --- /dev/null +++ b/tests/test-tokenizer-0-llama.cpp @@ -0,0 +1,190 @@ +#include "llama.h" +#include "common.h" +#include "console.h" + +#include +#include +#include +#include +#include + +// generate using test-tokenizer-0-llama.py +static const std::map> & k_tests() { + static std::map> _k_tests = { + { "" , { }, }, + { " " , { 259, }, }, + { " " , { 1678, }, }, + { " " , { 268, }, }, + { "\t" , { 29871, 12, }, }, + { "\n" , { 29871, 13, }, }, + { "\t\n" , { 29871, 12, 13, }, }, + { "Hello world" , { 15043, 3186, }, }, + { " Hello world" , { 29871, 15043, 3186, }, }, + { "Hello World" , { 15043, 2787, }, }, + { " Hello World" , { 29871, 15043, 2787, }, }, + { " Hello World!" , { 29871, 15043, 2787, 29991, }, }, + { "Hello, world!" , { 15043, 29892, 3186, 29991, }, }, + { " Hello, world!" 
+
+print(tokenizer.encode('hello'))
+print(tokenizer.encode('world'))
+print(tokenizer.encode(' world'))
+print(tokenizer.encode('hello world'))
+
+fname_tok = args.fname_tok
+if fname_tok:
+    print('tokenizing file: ', fname_tok)
+    fname_out = fname_tok + '.tok'
+    with open(fname_tok, 'r') as f:
+        lines = f.readlines()
+    s = ''.join(lines)
+    res = tokenizer.encode(s)
+    # write to file
+    with open(fname_out, 'w') as f:
+        for x in res:
+            f.write(str(x) + ' ')
+        f.write('\n')
+    print('len(res): ', len(res))
+    print('len(lines): ', len(lines))
+    print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dfb2e81a9bc6ff19368c4aa4a3e5d61dc7056ea7
--- /dev/null
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -0,0 +1,190 @@
+#include "llama.h"
+#include "common.h"
+#include "console.h"
+
+#include <cstdio>
+#include <string>
+#include <map>
+#include <vector>
+#include <fstream>
+
+// generate using test-tokenizer-0-llama.py
+static const std::map<std::string, std::vector<llama_token>> & k_tests() {
+    static std::map<std::string, std::vector<llama_token>> _k_tests = {
+        { ""                      , { }, },
+        { " "                     , { 259, }, },
+        { "  "                    , { 1678, }, },
+        { "   "                   , { 268, }, },
+        { "\t"                    , { 29871, 12, }, },
+        { "\n"                    , { 29871, 13, }, },
+        { "\t\n"                  , { 29871, 12, 13, }, },
+        { "Hello world"           , { 15043, 3186, }, },
+        { " Hello world"          , { 29871, 15043, 3186, }, },
+        { "Hello World"           , { 15043, 2787, }, },
+        { " Hello World"          , { 29871, 15043, 2787, }, },
+        { " Hello World!"         , { 29871, 15043, 2787, 29991, }, },
+        { "Hello, world!"         , { 15043, 29892, 3186, 29991, }, },
+        { " Hello, world!"        , { 29871, 15043, 29892, 3186, 29991, }, },
+        { " this is 🦙.cpp"        , { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
+        { "w048 7tuijk dsdfhu"    , { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
+        { "нещо на Български"     , { 1538, 4851, 665, 1386, 29713, 1305, }, },
+        { "កាន់តែពិសេសអាចខលចេញ"   , { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, 136, 228, 162, 132, 228, 161, 140, }, },
+        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
+        { "Hello"                 , { 15043, }, },
+        { " Hello"                , { 29871, 15043, }, },
+        { "  Hello"               , { 259, 15043, }, },
+        { "   Hello"              , { 1678, 15043, }, },
+        { "    Hello"             , { 268, 15043, }, },
+        { "    Hello\n    Hello"  , { 268, 15043, 13, 1678, 15043, }, },
+        { " ("                    , { 29871, 313, }, },
+    };
+
+    return _k_tests;
+}
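+
+// Note: the expected vectors above deliberately contain no BOS token; main()
+// tokenizes each string twice and, when add_bos is enabled, expects the same
+// ids shifted by one with id 1 (BOS) prepended. Id 29871 is the SentencePiece
+// leading-space piece '▁' (see test-tokenizer-0-llama.py).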
", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res_nobos) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + + success = false; + } + } + + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + const std::vector res = llama_tokenize(ctx, text, true); + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + ofs << tok << " "; + } + + ofs << "\n"; + } + + fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + return success ? 0 : 3; +} diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py new file mode 100644 index 0000000000000000000000000000000000000000..bc164ee296cb1d6ff1ee28469ee913b7c345e12e --- /dev/null +++ b/tests/test-tokenizer-0-llama.py @@ -0,0 +1,95 @@ +# tests with SPM tokenizer + +import os +import sys +import argparse + +from sentencepiece import SentencePieceProcessor + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize") +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer + +tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') + +tests = [ + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is 🦙.cpp", + "w048 7tuijk dsdfhu", + "нещо на Български", + "កាន់តែពិសេសអាចខលចេញ", + "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + ] + + +for text in tests: + print('text: ', text) + print('\nwith bos:') + print(tokenizer.encode(text, add_bos=True)) + print(tokenizer.decode(tokenizer.encode(text, add_bos=True))) + print('\nwithout bos:') + print(tokenizer.encode(text, add_bos=False)) + print(tokenizer.decode(tokenizer.encode(text, add_bos=False))) + +print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello' +print("'" + tokenizer.id_to_piece(29871) + "'") # '_' +print("'" + tokenizer.decode([15043]) + "'") # 'Hello' +print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello' +print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello' +print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello' + +print("\n\ntests for C++:\n") +for text in tests: + res = tokenizer.encode(text, add_bos=False) + + k = text.replace('\n', '\\n') + k = k.replace('\t', '\\t') + k = '"' + k + '"' + print("{ %-24s, { " % k, end='') + for x in res: + print("%7d," % x, end='') + print(" }, },") + 
+
+print("\n\ntests for C++:\n")
+for text in tests:
+    res = tokenizer.encode(text, add_bos=False)
+
+    k = text.replace('\n', '\\n')
+    k = k.replace('\t', '\\t')
+    k = '"' + k + '"'
+    print("{ %-24s, { " % k, end='')
+    for x in res:
+        print("%7d," % x, end='')
+    print(" }, },")
+
+print(tokenizer.encode('hello'))
+print(tokenizer.encode('world'))
+print(tokenizer.encode(' world'))
+print(tokenizer.encode('hello world'))
+
+fname_tok = args.fname_tok
+if fname_tok:
+    print('tokenizing file: ', fname_tok)
+    fname_out = fname_tok + '.tok'
+    with open(fname_tok, 'r') as f:
+        lines = f.readlines()
+    s = ''.join(lines)
+    res = tokenizer.encode(s, add_bos=True)
+    # write to file
+    with open(fname_out, 'w') as f:
+        for x in res:
+            f.write(str(x) + ' ')
+        f.write('\n')
+    print('len(res): ', len(res))
+    print('len(lines): ', len(lines))
+    print('results written to: ', fname_out)
diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a95d462cfcd0b181c54950ff5697c3ba5c4456f0
--- /dev/null
+++ b/tests/test-tokenizer-1-llama.cpp
@@ -0,0 +1,125 @@
+#include "llama.h"
+#include "common.h"
+#include "console.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <codecvt>
+#include <map>
+#include <vector>
+#include <locale>
+#include <stdexcept> // for std::invalid_argument
+
+typedef int codepoint;
+
+static std::string codepoint_to_utf8(codepoint cp) {
+    std::string result;
+    if (0x00 <= cp && cp <= 0x7f) {
+        result.push_back(cp);
+    } else if (0x80 <= cp && cp <= 0x7ff) {
+        result.push_back(0xc0 | ((cp >> 6) & 0x1f));
+        result.push_back(0x80 | (cp & 0x3f));
+    } else if (0x800 <= cp && cp <= 0xffff) {
+        result.push_back(0xe0 | ((cp >> 12) & 0x0f));
+        result.push_back(0x80 | ((cp >> 6) & 0x3f));
+        result.push_back(0x80 | (cp & 0x3f));
+    } else if (0x10000 <= cp && cp <= 0x10ffff) {
+        result.push_back(0xf0 | ((cp >> 18) & 0x07));
+        result.push_back(0x80 | ((cp >> 12) & 0x3f));
+        result.push_back(0x80 | ((cp >> 6) & 0x3f));
+        result.push_back(0x80 | (cp & 0x3f));
+    } else {
+        throw std::invalid_argument("invalid codepoint");
+    }
+    return result;
+}
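+
+// Worked example for the two-byte branch above: U+00E9 ('é', cp = 0xe9) gives
+// 0xc0 | ((0xe9 >> 6) & 0x1f) = 0xc3 and 0x80 | (0xe9 & 0x3f) = 0xa9, i.e. the
+// UTF-8 byte sequence 0xc3 0xa9. Surrogate codepoints (0xd800-0xdfff) are not
+// valid on their own and are skipped by the caller below.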
+
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init(false);
+
+    // load the vocab
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.vocab_only = true;
+
+        model = llama_load_model_from_file(fname.c_str(), lparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
+            return 1;
+        }
+    }
+
+    GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM);
+
+#ifdef _WIN32
+    // We need this for unicode console support
+    console::init(false, false);
+    atexit([]() { console::cleanup(); });
+#endif
+
+    const int n_vocab = llama_n_vocab(ctx);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        std::string str = llama_detokenize_spm(ctx, std::vector<llama_token>(1, i));
+        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+        std::string check = llama_detokenize_spm(ctx, tokens);
+        if (check != str) {
+            fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
+                __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
+            return 2;
+        }
+    }
+
+    for (codepoint cp = 0x0000; cp < 0xffff; ++cp) {
+        if (cp < 0xd800 || cp > 0xdfff) {
+            std::string str = codepoint_to_utf8(cp);
+            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+            std::string check = llama_detokenize_spm(ctx, tokens);
+            if (cp != 9601 && str != check) {
+                fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                    __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
+                return 3;
+            }
+        }
+    }
+    for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) {
+        std::string str = codepoint_to_utf8(cp);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+        std::string check = llama_detokenize_spm(ctx, tokens);
+        if (str != check) {
+            fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
+            return 4;
+        }
+    }
+
+    llama_free_model(model);
+    llama_free(ctx);
+
+    llama_backend_free();
+
+    return 0;
+}