Illumotion committed
Commit f57d7c6
1 parent: 411033d

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .devops/cloud-v-pipeline +22 -0
  2. .devops/full-cuda.Dockerfile +1 -1
  3. .devops/full-rocm.Dockerfile +44 -0
  4. .devops/llama-cpp-clblast.srpm.spec +84 -0
  5. .devops/llama-cpp-cublas.srpm.spec +83 -0
  6. .devops/llama-cpp.srpm.spec +85 -0
  7. .devops/main-cuda.Dockerfile +1 -1
  8. .devops/main-rocm.Dockerfile +44 -0
  9. .editorconfig +3 -0
  10. .github/workflows/code-coverage.yml +36 -0
  11. .github/workflows/gguf-publish.yml +43 -0
  12. .gitignore +27 -30
  13. CMakeLists.txt +116 -20
  14. Dockerfile +2 -1
  15. MIT_LICENSE_GGML_LLAMACPP_ONLY +1 -1
  16. Makefile +89 -38
  17. Package.swift +35 -4
  18. README.md +1 -1
  19. build-info.h +2 -0
  20. ci/run.sh +141 -44
  21. class.py +313 -0
  22. codecov.yml +14 -0
  23. colab.ipynb +61 -0
  24. common/CMakeLists.txt +20 -0
  25. common/common.cpp +1270 -0
  26. common/common.h +211 -0
  27. common/console.cpp +501 -0
  28. common/console.h +19 -0
  29. common/grammar-parser.cpp +424 -0
  30. common/grammar-parser.h +29 -0
  31. common/log.h +643 -0
  32. convert-baichuan-hf-to-gguf.py +304 -0
  33. convert-falcon-hf-to-gguf.py +281 -0
  34. convert-gptneox-hf-to-gguf.py +251 -0
  35. convert-llama-ggml-to-gguf.py +451 -0
  36. convert-lora-to-ggml.py +22 -19
  37. convert-starcoder-hf-to-gguf.py +248 -0
  38. convert.py +638 -756
  39. docs/token_generation_performance_tips.md +3 -3
  40. examples/CMakeLists.txt +4 -21
  41. examples/baby-llama/baby-llama.cpp +77 -76
  42. examples/beam-search/CMakeLists.txt +5 -0
  43. examples/beam-search/beam-search.cpp +186 -0
  44. examples/benchmark/CMakeLists.txt +2 -1
  45. examples/benchmark/benchmark-matmult.cpp +23 -20
  46. examples/chat.sh +1 -1
  47. examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  48. examples/convert-llama2c-to-ggml/README.md +26 -0
  49. examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +963 -0
  50. examples/embd-input/embd-input-lib.cpp +8 -11
.devops/cloud-v-pipeline ADDED
@@ -0,0 +1,22 @@
+ node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
+ stage('Cleanup'){
+ cleanWs() // Cleaning previous CI build in workspace
+ }
+ stage('checkout repo'){
+ retry(5){ // Retry if the cloning fails due to some reason
+ checkout scm // Clone the repo on Runner
+ }
+ }
+ stage('Compiling llama.cpp'){
+ sh'''#!/bin/bash
+ make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
+ '''
+ }
+ stage('Running llama.cpp'){
+ sh'''#!/bin/bash
+ module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
+ qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+ cat llama_log.txt # Printing results
+ '''
+ }
+ }
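For reference, the two pipeline stages above boil down to the following commands when run by hand outside Jenkins (the qemu sysroot path and CPU flags are the site-specific values from the pipeline; the model path here is a placeholder):
make RISCV=1 RISCV_CROSS_COMPILE=1   # cross-compile llama.cpp for RISC-V
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m ./model.gguf -p "Anything" -n 9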
.devops/full-cuda.Dockerfile CHANGED
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
  ARG CUDA_DOCKER_ARCH=all

  RUN apt-get update && \
- apt-get install -y build-essential python3 python3-pip
+ apt-get install -y build-essential python3 python3-pip git

  COPY requirements.txt requirements.txt

.devops/full-rocm.Dockerfile ADDED
@@ -0,0 +1,44 @@
+ ARG UBUNTU_VERSION=22.04
+
+ # This needs to generally match the container host's environment.
+ ARG ROCM_VERSION=5.6
+
+ # Target the CUDA build image
+ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+ FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+ # Unless otherwise specified, we make a fat build.
+ # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+ # This is mostly tied to rocBLAS supported archs.
+ ARG ROCM_DOCKER_ARCH=\
+ gfx803 \
+ gfx900 \
+ gfx906 \
+ gfx908 \
+ gfx90a \
+ gfx1010 \
+ gfx1030 \
+ gfx1100 \
+ gfx1101 \
+ gfx1102
+
+ COPY requirements.txt requirements.txt
+
+ RUN pip install --upgrade pip setuptools wheel \
+ && pip install -r requirements.txt
+
+ WORKDIR /app
+
+ COPY . .
+
+ # Set nvcc architecture
+ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+ # Enable ROCm
+ ENV LLAMA_HIPBLAS=1
+ ENV CC=/opt/rocm/llvm/bin/clang
+ ENV CXX=/opt/rocm/llvm/bin/clang++
+
+ RUN make
+
+ ENTRYPOINT ["/app/.devops/tools.sh"]
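A minimal sketch of building this image (the tag is hypothetical; ROCM_DOCKER_ARCH can be narrowed to a single entry from the list above to shorten the build):
docker build -t llamacpp-full-rocm -f .devops/full-rocm.Dockerfile .
docker build --build-arg ROCM_DOCKER_ARCH=gfx1030 -t llamacpp-full-rocm -f .devops/full-rocm.Dockerfile .   # single-arch build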
.devops/llama-cpp-clblast.srpm.spec ADDED
@@ -0,0 +1,84 @@
+ # SRPM for building from source and packaging an RPM for RPM-based distros.
+ # https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+ # Built and maintained by John Boero - [email protected]
+ # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+ # Notes for llama.cpp:
+ # 1. Tags are currently based on hash - which will not sort asciibetically.
+ # We need to declare standard versioning if people want to sort latest releases.
+ # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
+ # 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+ # Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+ # It is up to the user to install the correct vendor-specific support.
+
+ Name: llama.cpp-clblast
+ Version: %( date "+%%Y%%m%%d" )
+ Release: 1%{?dist}
+ Summary: OpenCL Inference of LLaMA model in C/C++
+ License: MIT
+ Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+ BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
+ Requires: clblast
+ URL: https://github.com/ggerganov/llama.cpp
+
+ %define debug_package %{nil}
+ %define source_date_epoch_from_changelog 0
+
+ %description
+ CPU inference for Meta's Lllama2 models using default options.
+
+ %prep
+ %setup -n llama.cpp-master
+
+ %build
+ make -j LLAMA_CLBLAST=1
+
+ %install
+ mkdir -p %{buildroot}%{_bindir}/
+ cp -p main %{buildroot}%{_bindir}/llamaclblast
+ cp -p server %{buildroot}%{_bindir}/llamaclblastserver
+ cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
+
+ mkdir -p %{buildroot}/usr/lib/systemd/system
+ %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
+ [Unit]
+ Description=Llama.cpp server, CPU only (no GPU support in this build).
+ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+ [Service]
+ Type=simple
+ EnvironmentFile=/etc/sysconfig/llama
+ ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
+ ExecReload=/bin/kill -s HUP $MAINPID
+ Restart=never
+
+ [Install]
+ WantedBy=default.target
+ EOF
+
+ mkdir -p %{buildroot}/etc/sysconfig
+ %{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+ LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+ EOF
+
+ %clean
+ rm -rf %{buildroot}
+ rm -rf %{_builddir}/*
+
+ %files
+ %{_bindir}/llamaclblast
+ %{_bindir}/llamaclblastserver
+ %{_bindir}/llamaclblastsimple
+ /usr/lib/systemd/system/llamaclblast.service
+ %config /etc/sysconfig/llama
+
+
+ %pre
+
+ %post
+
+ %preun
+ %postun
+
+ %changelog
.devops/llama-cpp-cublas.srpm.spec ADDED
@@ -0,0 +1,83 @@
+ # SRPM for building from source and packaging an RPM for RPM-based distros.
+ # https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+ # Built and maintained by John Boero - [email protected]
+ # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+ # Notes for llama.cpp:
+ # 1. Tags are currently based on hash - which will not sort asciibetically.
+ # We need to declare standard versioning if people want to sort latest releases.
+ # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
+ # 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+ # Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+ # It is up to the user to install the correct vendor-specific support.
+
+ Name: llama.cpp-cublas
+ Version: %( date "+%%Y%%m%%d" )
+ Release: 1%{?dist}
+ Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
+ License: MIT
+ Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+ BuildRequires: coreutils make gcc-c++ git cuda-toolkit
+ Requires: cuda-toolkit
+ URL: https://github.com/ggerganov/llama.cpp
+
+ %define debug_package %{nil}
+ %define source_date_epoch_from_changelog 0
+
+ %description
+ CPU inference for Meta's Lllama2 models using default options.
+
+ %prep
+ %setup -n llama.cpp-master
+
+ %build
+ make -j LLAMA_CUBLAS=1
+
+ %install
+ mkdir -p %{buildroot}%{_bindir}/
+ cp -p main %{buildroot}%{_bindir}/llamacppcublas
+ cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
+ cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+
+ mkdir -p %{buildroot}/usr/lib/systemd/system
+ %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
+ [Unit]
+ Description=Llama.cpp server, CPU only (no GPU support in this build).
+ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+ [Service]
+ Type=simple
+ EnvironmentFile=/etc/sysconfig/llama
+ ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
+ ExecReload=/bin/kill -s HUP $MAINPID
+ Restart=never
+
+ [Install]
+ WantedBy=default.target
+ EOF
+
+ mkdir -p %{buildroot}/etc/sysconfig
+ %{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+ LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+ EOF
+
+ %clean
+ rm -rf %{buildroot}
+ rm -rf %{_builddir}/*
+
+ %files
+ %{_bindir}/llamacppcublas
+ %{_bindir}/llamacppcublasserver
+ %{_bindir}/llamacppcublassimple
+ /usr/lib/systemd/system/llamacublas.service
+ %config /etc/sysconfig/llama
+
+ %pre
+
+ %post
+
+ %preun
+ %postun
+
+ %changelog
.devops/llama-cpp.srpm.spec ADDED
@@ -0,0 +1,85 @@
+ # SRPM for building from source and packaging an RPM for RPM-based distros.
+ # https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+ # Built and maintained by John Boero - [email protected]
+ # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+ # Notes for llama.cpp:
+ # 1. Tags are currently based on hash - which will not sort asciibetically.
+ # We need to declare standard versioning if people want to sort latest releases.
+ # In the meantime, YYYYMMDD format will be used.
+ # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
+ # 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+ # Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+ # It is up to the user to install the correct vendor-specific support.
+
+ Name: llama.cpp
+ Version: %( date "+%%Y%%m%%d" )
+ Release: 1%{?dist}
+ Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
+ License: MIT
+ Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+ BuildRequires: coreutils make gcc-c++ git libstdc++-devel
+ Requires: libstdc++
+ URL: https://github.com/ggerganov/llama.cpp
+
+ %define debug_package %{nil}
+ %define source_date_epoch_from_changelog 0
+
+ %description
+ CPU inference for Meta's Lllama2 models using default options.
+ Models are not included in this package and must be downloaded separately.
+
+ %prep
+ %setup -n llama.cpp-master
+
+ %build
+ make -j
+
+ %install
+ mkdir -p %{buildroot}%{_bindir}/
+ cp -p main %{buildroot}%{_bindir}/llama
+ cp -p server %{buildroot}%{_bindir}/llamaserver
+ cp -p simple %{buildroot}%{_bindir}/llamasimple
+
+ mkdir -p %{buildroot}/usr/lib/systemd/system
+ %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
+ [Unit]
+ Description=Llama.cpp server, CPU only (no GPU support in this build).
+ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+ [Service]
+ Type=simple
+ EnvironmentFile=/etc/sysconfig/llama
+ ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
+ ExecReload=/bin/kill -s HUP $MAINPID
+ Restart=never
+
+ [Install]
+ WantedBy=default.target
+ EOF
+
+ mkdir -p %{buildroot}/etc/sysconfig
+ %{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+ LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+ EOF
+
+ %clean
+ rm -rf %{buildroot}
+ rm -rf %{_builddir}/*
+
+ %files
+ %{_bindir}/llama
+ %{_bindir}/llamaserver
+ %{_bindir}/llamasimple
+ /usr/lib/systemd/system/llama.service
+ %config /etc/sysconfig/llama
+
+ %pre
+
+ %post
+
+ %preun
+ %postun
+
+ %changelog
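A sketch of building any of the three spec files above locally, assuming rpm-build plus rpmdevtools (for spectool) and the spec's BuildRequires are installed; the default ~/rpmbuild tree is used:
spectool -g -R .devops/llama-cpp.srpm.spec   # download Source0 into ~/rpmbuild/SOURCES
rpmbuild -ba .devops/llama-cpp.srpm.spec     # produce the binary and source RPMs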
.devops/main-cuda.Dockerfile CHANGED
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
  ARG CUDA_DOCKER_ARCH=all

  RUN apt-get update && \
- apt-get install -y build-essential
+ apt-get install -y build-essential git

  WORKDIR /app

.devops/main-rocm.Dockerfile ADDED
@@ -0,0 +1,44 @@
+ ARG UBUNTU_VERSION=22.04
+
+ # This needs to generally match the container host's environment.
+ ARG ROCM_VERSION=5.6
+
+ # Target the CUDA build image
+ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+ FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+ # Unless otherwise specified, we make a fat build.
+ # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+ # This is mostly tied to rocBLAS supported archs.
+ ARG ROCM_DOCKER_ARCH=\
+ gfx803 \
+ gfx900 \
+ gfx906 \
+ gfx908 \
+ gfx90a \
+ gfx1010 \
+ gfx1030 \
+ gfx1100 \
+ gfx1101 \
+ gfx1102
+
+ COPY requirements.txt requirements.txt
+
+ RUN pip install --upgrade pip setuptools wheel \
+ && pip install -r requirements.txt
+
+ WORKDIR /app
+
+ COPY . .
+
+ # Set nvcc architecture
+ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+ # Enable ROCm
+ ENV LLAMA_HIPBLAS=1
+ ENV CC=/opt/rocm/llvm/bin/clang
+ ENV CXX=/opt/rocm/llvm/bin/clang++
+
+ RUN make
+
+ ENTRYPOINT [ "/app/main" ]
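Because the entrypoint here is /app/main, anything after the image name is passed straight to main. A hedged run sketch (the tag and model path are placeholders; the --device flags are the usual ROCm container passthrough):
docker build -t llamacpp-main-rocm -f .devops/main-rocm.Dockerfile .
docker run --device /dev/kfd --device /dev/dri -v /models:/models llamacpp-main-rocm -m /models/model.gguf -p "Hello" -n 64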
.editorconfig CHANGED
@@ -17,3 +17,6 @@ indent_style = tab

  [prompts/*.txt]
  insert_final_newline = unset
+
+ [examples/server/public/*]
+ indent_size = 2
.github/workflows/code-coverage.yml ADDED
@@ -0,0 +1,36 @@
+ name: Code Coverage
+ on: [push, pull_request]
+
+ env:
+   GGML_NLOOP: 3
+   GGML_N_THREADS: 1
+
+ jobs:
+   run:
+     runs-on: ubuntu-20.04
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v3
+
+       - name: Dependencies
+         run: |
+           sudo apt-get update
+           sudo apt-get install build-essential gcc-8 lcov
+
+       - name: Build
+         run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
+
+       - name: Run tests
+         run: CC=gcc-8 make test
+
+       - name: Generate coverage report
+         run: |
+           make coverage
+           make lcov-report
+
+       - name: Upload coverage to Codecov
+         uses: codecov/codecov-action@v3
+         env:
+           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+         with:
+           files: lcov-report/coverage.info
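The same coverage run can be reproduced locally with the targets the workflow invokes (assumes gcc-8 and lcov are installed):
CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
CC=gcc-8 make test
make coverage
make lcov-report    # produces lcov-report/coverage.info, the file uploaded to Codecov above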
.github/workflows/gguf-publish.yml ADDED
@@ -0,0 +1,43 @@
+ # This workflow will upload a Python Package using Twine when a GGUF release is created
+ # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+ # See `gguf-py/README.md` for how to make a release.
+
+ # This workflow uses actions that are not certified by GitHub.
+ # They are provided by a third-party and are governed by
+ # separate terms of service, privacy policy, and support
+ # documentation.
+
+ name: Upload Python Package
+
+ on:
+   workflow_dispatch:
+   push:
+     # Pattern matched against refs/tags
+     tags:
+       - 'gguf-v*' # Push events to every version tag
+
+
+ jobs:
+   deploy:
+
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v3
+       - name: Set up Python
+         uses: actions/setup-python@v2
+         with:
+           python-version: '3.9.x'
+       - name: Install dependencies
+         run: |
+           cd gguf-py
+           python -m pip install poetry
+           poetry install
+
+       - name: Build package
+         run: poetry build
+       - name: Publish package
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           password: ${{ secrets.PYPI_API_TOKEN }}
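Outside CI the release flow is roughly the following (a sketch; publishing needs a PyPI token, shown here as a placeholder, whereas the workflow delegates to the pypa publish action instead):
cd gguf-py
python -m pip install poetry
poetry install
poetry build
poetry publish --username __token__ --password "$PYPI_API_TOKEN"   # placeholder credentials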
.gitignore CHANGED
@@ -1,6 +1,6 @@
1
  *.o
2
  *.a
3
- *.so
4
  .DS_Store
5
  .build/
6
  .cache/
@@ -12,20 +12,7 @@
12
  .vs/
13
  .vscode/
14
 
15
- build/
16
- build-em/
17
- build-debug/
18
- build-release/
19
- build-ci-debug/
20
- build-ci-release/
21
- build-static/
22
- build-cublas/
23
- build-opencl/
24
- build-metal/
25
- build-mpi/
26
- build-no-accel/
27
- build-sanitize-addr/
28
- build-sanitize-thread/
29
  out/
30
  tmp/
31
 
@@ -39,19 +26,24 @@ models-mnt
39
  /perplexity
40
  /embedding
41
  /train-text-from-scratch
 
42
  /simple
43
  /benchmark-matmult
44
  /vdot
45
  /server
46
  /Pipfile
47
  /embd-input-test
 
 
48
  /libllama.so
49
-
 
50
  arm_neon.h
51
  compile_commands.json
52
  CMakeSettings.json
53
 
54
  __pycache__
 
55
 
56
  dist/
57
  *.spec
@@ -65,11 +57,11 @@ perf-*.txt
65
 
66
  examples/jeopardy/results.txt
67
 
68
- pyproject.toml
69
  poetry.lock
70
  poetry.toml
71
 
72
  # Test binaries
 
73
  tests/test-double-float
74
  tests/test-grad0
75
  tests/test-opt
@@ -78,16 +70,21 @@ tests/test-quantize-perf
78
  tests/test-sampling
79
  tests/test-tokenizer-0
80
 
81
- koboldcpp.so
82
- koboldcpp_failsafe.so
83
- koboldcpp_openblas.so
84
- koboldcpp_noavx2.so
85
- koboldcpp_clblast.so
86
- koboldcpp.dll
87
- koboldcpp_failsafe.dll
88
- koboldcpp_openblas.dll
89
- koboldcpp_noavx2.dll
90
- koboldcpp_clblast.dll
91
- koboldcpp_cublas.dll
92
- cublas64_11.dll
93
- cublasLt64_11.dll
1
  *.o
2
  *.a
3
+ *.bin
4
  .DS_Store
5
  .build/
6
  .cache/
 
12
  .vs/
13
  .vscode/
14
 
15
+ build*/
16
  out/
17
  tmp/
18
 
 
26
  /perplexity
27
  /embedding
28
  /train-text-from-scratch
29
+ /convert-llama2c-to-ggml
30
  /simple
31
  /benchmark-matmult
32
  /vdot
33
  /server
34
  /Pipfile
35
  /embd-input-test
36
+ /gguf
37
+ /gguf-llama-simple
38
  /libllama.so
39
+ /llama-bench
40
+ build-info.h
41
  arm_neon.h
42
  compile_commands.json
43
  CMakeSettings.json
44
 
45
  __pycache__
46
+ dist
47
 
48
  dist/
49
  *.spec
 
57
 
58
  examples/jeopardy/results.txt
59
 
 
60
  poetry.lock
61
  poetry.toml
62
 
63
  # Test binaries
64
+ tests/test-grammar-parser
65
  tests/test-double-float
66
  tests/test-grad0
67
  tests/test-opt
 
70
  tests/test-sampling
71
  tests/test-tokenizer-0
72
 
73
+ /koboldcpp_default.so
74
+ /koboldcpp_failsafe.so
75
+ /koboldcpp_openblas.so
76
+ /koboldcpp_noavx2.so
77
+ /koboldcpp_clblast.so
78
+ /koboldcpp_cublas.so
79
+ /koboldcpp_default.dll
80
+ /koboldcpp_failsafe.dll
81
+ /koboldcpp_openblas.dll
82
+ /koboldcpp_noavx2.dll
83
+ /koboldcpp_clblast.dll
84
+ /koboldcpp_cublas.dll
85
+ /cublas64_11.dll
86
+ /cublasLt64_11.dll
87
+ /rocblas/
88
+ rocblas.dll
89
+ hipblas.dll
90
+ koboldcpp_hipblas.so
CMakeLists.txt CHANGED
@@ -50,6 +50,9 @@ set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA
50
  set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
51
  option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
52
  set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 
 
 
53
  option(LLAMA_K_QUANTS "llama: use k-quants" ON)
54
 
55
 
@@ -65,6 +68,11 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
65
  find_package(Threads REQUIRED)
66
 
67
  add_compile_definitions(GGML_USE_K_QUANTS)
 
 
 
 
 
68
 
69
  if (LLAMA_CUBLAS)
70
  cmake_minimum_required(VERSION 3.17)
@@ -75,10 +83,6 @@ if (LLAMA_CUBLAS)
75
 
76
  enable_language(CUDA)
77
 
78
- set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
79
- set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
80
- set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
81
-
82
  add_compile_definitions(GGML_USE_CUBLAS)
83
  #add_compile_definitions(GGML_CUDA_CUBLAS) #remove to not use cublas
84
  add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
@@ -91,6 +95,7 @@ if (LLAMA_CUBLAS)
91
  add_compile_definitions(GGML_CUDA_F16)
92
  endif()
93
  add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
 
94
 
95
  if (LLAMA_STATIC)
96
  set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -121,6 +126,75 @@ if (LLAMA_CUBLAS)
  endif()
  endif()

124
  if (LLAMA_ALL_WARNINGS)
125
  if (NOT MSVC)
126
  set(c_flags
@@ -133,15 +207,22 @@ if (LLAMA_ALL_WARNINGS)
133
  -Wstrict-prototypes
134
  -Wpointer-arith
135
  -Wmissing-prototypes
 
 
136
  )
137
  set(cxx_flags
138
  -Wall
139
  -Wextra
140
  -Wpedantic
141
  -Wcast-qual
 
142
  -Wno-unused-function
143
  -Wno-multichar
144
  )
 
 
 
 
145
  else()
146
  # todo : msvc
147
  endif()
@@ -153,7 +234,7 @@ if (LLAMA_ALL_WARNINGS)
153
 
154
  endif()
155
 
156
- if (MSVC)
157
  add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
158
 
159
  if (BUILD_SHARED_LIBS)
@@ -190,7 +271,7 @@ if (NOT MSVC)
190
  endif()
191
  endif()
192
 
193
- if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
194
  message(STATUS "ARM detected")
195
  if (MSVC)
196
  # TODO: arm msvc?
@@ -301,37 +382,52 @@ target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
301
  set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
302
 
303
  add_library(common2
304
- examples/common.cpp
305
- examples/common.h)
306
- target_include_directories(common2 PUBLIC . ./otherarch ./otherarch/tools ./examples)
 
 
307
  target_compile_features(common2 PUBLIC cxx_std_11) # don't bump
308
  target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
309
  set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
310
 
311
  add_library(gpttype_adapter
312
  gpttype_adapter.cpp)
313
- target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./examples)
314
  target_compile_features(gpttype_adapter PUBLIC cxx_std_11) # don't bump
315
  target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
316
  set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
317
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
- set(TARGET koboldcpp_cublas)
320
- add_library(${TARGET} SHARED expose.cpp expose.h)
321
- target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples)
322
- target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
323
- set_target_properties(${TARGET} PROPERTIES PREFIX "")
324
- set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
325
- set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
326
- target_link_libraries(${TARGET} PUBLIC ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${CMAKE_THREAD_LIBS_INIT})
327
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
 
 
328
 
329
 
330
  if (MAKE_MISC_FILES)
 
331
  add_library(llama
332
  llama.cpp
333
  llama.h
334
- llama-util.h
335
  )
336
  target_include_directories(llama PUBLIC .)
337
  target_compile_features(llama PUBLIC cxx_std_11) # don't bump
 
50
  set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
51
  option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
52
  set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
53
+ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
54
+ "llama: max. batch size for using peer access")
55
+ option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
56
  option(LLAMA_K_QUANTS "llama: use k-quants" ON)
57
 
58
 
 
68
  find_package(Threads REQUIRED)
69
 
70
  add_compile_definitions(GGML_USE_K_QUANTS)
71
+ add_compile_definitions(LOG_DISABLE_LOGS)
72
+
73
+ set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
74
+ set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
75
+ set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
76
 
77
  if (LLAMA_CUBLAS)
78
  cmake_minimum_required(VERSION 3.17)
 
83
 
84
  enable_language(CUDA)
85
 
 
 
 
 
86
  add_compile_definitions(GGML_USE_CUBLAS)
87
  #add_compile_definitions(GGML_CUDA_CUBLAS) #remove to not use cublas
88
  add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
 
95
  add_compile_definitions(GGML_CUDA_F16)
96
  endif()
97
  add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
98
+ add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
99
 
100
  if (LLAMA_STATIC)
101
  set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
 
126
  endif()
127
  endif()
128
 
129
+ if (LLAMA_HIPBLAS)
130
+ if (MSVC)
131
+ list(APPEND CMAKE_PREFIX_PATH "C:/Program Files/AMD/ROCm/5.5")
132
+ else()
133
+ list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
134
+ endif()
135
+
136
+
137
+ if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
138
+ message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
139
+ endif()
140
+ if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
141
+ message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
142
+ endif()
143
+
144
+ find_package(hip)
145
+ find_package(hipblas)
146
+ find_package(rocblas)
147
+
148
+ if (${hipblas_FOUND} AND ${hip_FOUND})
149
+ message(STATUS "HIP and hipBLAS found")
150
+ add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
151
+ add_library(ggml-rocm OBJECT ${GGML_SOURCES_CUDA})
152
+ if (LLAMA_CUDA_FORCE_DMMV)
153
+ target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
154
+ endif()
155
+ target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
156
+ target_compile_definitions(ggml-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
157
+ target_compile_definitions(ggml-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
158
+ target_compile_definitions(ggml-rocm PUBLIC CC_TURING=1000000000)
159
+ set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
160
+ target_link_libraries(ggml-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
161
+
162
+
163
+ add_library(ggml-v2-rocm OBJECT ${GGML_V2_CUDA_SOURCES})
164
+ if (LLAMA_CUDA_FORCE_DMMV)
165
+ target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
166
+ endif()
167
+ target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
168
+ target_compile_definitions(ggml-v2-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
169
+ target_compile_definitions(ggml-v2-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
170
+ target_compile_definitions(ggml-v2-rocm PUBLIC CC_TURING=1000000000)
171
+ set_source_files_properties(otherarch/ggml_v2-cuda.cu PROPERTIES LANGUAGE CXX)
172
+ target_link_libraries(ggml-v2-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
173
+
174
+
175
+ add_library(ggml-v2-legacy-rocm OBJECT ${GGML_V2_LEGACY_CUDA_SOURCES})
176
+ if (LLAMA_CUDA_FORCE_DMMV)
177
+ target_compile_definitions(ggml-v2-legacy-rocm PUBLIC GGML_CUDA_FORCE_DMMV)
178
+ endif()
179
+ target_compile_definitions(ggml-v2-legacy-rocm PUBLIC GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
180
+ target_compile_definitions(ggml-v2-legacy-rocm PUBLIC GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
181
+ target_compile_definitions(ggml-v2-legacy-rocm PUBLIC K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
182
+ target_compile_definitions(ggml-v2-legacy-rocm PUBLIC CC_TURING=1000000000)
183
+ set_source_files_properties(otherarch/ggml_v2-cuda-legacy.cu PROPERTIES LANGUAGE CXX)
184
+ target_link_libraries(ggml-v2-legacy-rocm PUBLIC hip::device hip::host roc::rocblas roc::hipblas)
185
+
186
+
187
+
188
+
189
+ if (LLAMA_STATIC)
190
+ message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
191
+ endif()
192
+ set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm ggml-v2-rocm ggml-v2-legacy-rocm)
193
+ else()
194
+ message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
195
+ endif()
196
+ endif()
197
+
198
  if (LLAMA_ALL_WARNINGS)
199
  if (NOT MSVC)
200
  set(c_flags
 
207
  -Wstrict-prototypes
208
  -Wpointer-arith
209
  -Wmissing-prototypes
210
+ -Werror=implicit-int
211
+ -Wno-unused-function
212
  )
213
  set(cxx_flags
214
  -Wall
215
  -Wextra
216
  -Wpedantic
217
  -Wcast-qual
218
+ -Wmissing-declarations
219
  -Wno-unused-function
220
  -Wno-multichar
221
  )
222
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
223
+ # g++ only
224
+ set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
225
+ endif()
226
  else()
227
  # todo : msvc
228
  endif()
 
234
 
235
  endif()
236
 
237
+ if (WIN32)
238
  add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
239
 
240
  if (BUILD_SHARED_LIBS)
 
271
  endif()
272
  endif()
273
 
274
+ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64"))
275
  message(STATUS "ARM detected")
276
  if (MSVC)
277
  # TODO: arm msvc?
 
382
  set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
383
 
384
  add_library(common2
385
+ common/common.cpp
386
+ common/common.h
387
+ common/grammar-parser.h
388
+ common/grammar-parser.cpp)
389
+ target_include_directories(common2 PUBLIC . ./otherarch ./otherarch/tools ./examples ./common)
390
  target_compile_features(common2 PUBLIC cxx_std_11) # don't bump
391
  target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
392
  set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
393
 
394
  add_library(gpttype_adapter
395
  gpttype_adapter.cpp)
396
+ target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./examples ./common)
397
  target_compile_features(gpttype_adapter PUBLIC cxx_std_11) # don't bump
398
  target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
399
  set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
400
 
401
+ if (LLAMA_CUBLAS)
402
+ set(TARGET koboldcpp_cublas)
403
+ add_library(${TARGET} SHARED expose.cpp expose.h)
404
+ target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples ./common)
405
+ target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
406
+ set_target_properties(${TARGET} PROPERTIES PREFIX "")
407
+ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
408
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
409
+ target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${LLAMA_EXTRA_LIBS})
410
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
411
+ endif()
412
 
413
+ if (LLAMA_HIPBLAS)
414
+ set(TARGET koboldcpp_hipblas)
415
+ add_library(${TARGET} SHARED expose.cpp expose.h)
416
+ target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples ./common)
417
+ target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
418
+ set_target_properties(${TARGET} PROPERTIES PREFIX "")
419
+ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
420
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
421
+ target_link_libraries(${TARGET} PUBLIC Threads::Threads ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${LLAMA_EXTRA_LIBS})
422
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
423
+ endif()
424
 
425
 
426
  if (MAKE_MISC_FILES)
427
+ add_subdirectory(common)
428
  add_library(llama
429
  llama.cpp
430
  llama.h
 
431
  )
432
  target_include_directories(llama PUBLIC .)
433
  target_compile_features(llama PUBLIC cxx_std_11) # don't bump
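A sketch of configuring the new LLAMA_HIPBLAS path added above; the compiler paths and /opt/rocm prefix come from the hints in the diff and should be adjusted to the local ROCm install:
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake -B build -DLLAMA_HIPBLAS=ON -DCMAKE_PREFIX_PATH=/opt/rocm
cmake --build build -j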
Dockerfile CHANGED
@@ -5,6 +5,7 @@ RUN apt update \
  && apt install build-essential wget libopenblas-dev make -y \
  && make LLAMA_OPENBLAS=1 \
  && wget https://huggingface.co/notstoic/pygmalion-13b-ggml/resolve/main/pygmalion-13b-ggml-q4_0.bin \
- && apt remove build-essential wget make -y
+ && apt remove build-essential wget make -y \
+ && rm -fr *.bat convert-* ci docs examples otherarchs tests

  ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-13b-ggml-q4_0.bin", "--port", "7860"]
MIT_LICENSE_GGML_LLAMACPP_ONLY CHANGED
@@ -23,4 +23,4 @@ SOFTWARE.
  ===================================

  Note that the above license applies ONLY to the GGML library and llama.cpp by ggerganov which are licensed under the MIT License
- Kobold Lite by Concedo and the provided python ctypes bindings in koboldcpp.dll are licensed under the AGPL v3.0 License
+ Kobold Lite by Concedo and the provided python ctypes bindings in koboldcpp dlls are licensed under the AGPL v3.0 License
Makefile CHANGED
@@ -1,4 +1,4 @@
1
- default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas
2
  tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
3
  dev: koboldcpp_openblas
4
  dev2: koboldcpp_clblast
@@ -20,8 +20,6 @@ ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/nul
20
  ARCH_ADD = -lcblas
21
  endif
22
 
23
- CCV := $(shell $(CC) --version | head -n 1)
24
- CXXV := $(shell $(CXX) --version | head -n 1)
25
 
26
  # Mac OS + Arm can report x86_64
27
  # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
@@ -41,8 +39,8 @@ endif
41
  #
42
 
43
  # keep standard at C11 and C++11
44
- CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
45
- CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
46
  LDFLAGS =
47
 
48
  # these are used on windows, to build some libraries with extra old device compatibility
@@ -110,7 +108,8 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
110
  # old library NEEDS mf16c to work. so we must build with it. new one doesnt
111
  ifeq ($(OS),Windows_NT)
112
  CFLAGS +=
113
- NONECFLAGS += -mno-sse3
 
114
  SIMPLECFLAGS += -mavx -msse3
115
  FULLCFLAGS += -mavx2 -msse3 -mfma -mf16c -mavx
116
  else
@@ -195,6 +194,42 @@ ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-l
195
  $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
196
  endif # LLAMA_CUBLAS
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  ifdef LLAMA_METAL
199
  CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
200
  CXXFLAGS += -DGGML_USE_METAL
@@ -224,12 +259,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
224
  CFLAGS += -mfp16-format=ieee -mno-unaligned-access
225
  endif
226
 
 
 
 
227
  DEFAULT_BUILD =
228
  FAILSAFE_BUILD =
229
  OPENBLAS_BUILD =
230
  NOAVX2_BUILD =
231
  CLBLAST_BUILD =
232
  CUBLAS_BUILD =
 
233
 
234
  ifeq ($(OS),Windows_NT)
235
  DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
@@ -238,10 +277,12 @@ ifeq ($(OS),Windows_NT)
238
  NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
239
  CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o [email protected] $(LDFLAGS)
240
 
241
- ifdef LLAMA_CUBLAS
242
- CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o [email protected] $(CUBLASLD_FLAGS) $(LDFLAGS)
243
- endif
244
-
 
 
245
  else
246
  DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
247
  FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
@@ -250,24 +291,29 @@ else
250
  NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
251
  endif
252
  ifdef LLAMA_CLBLAST
253
- ifeq ($(UNAME_S),Darwin)
254
- CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
255
- else
256
- CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
257
- endif
258
  endif
259
 
260
- ifdef LLAMA_CUBLAS
261
- CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o [email protected] $(CUBLASLD_FLAGS) $(LDFLAGS)
262
- endif
 
 
 
263
 
264
  ifndef LLAMA_OPENBLAS
265
  ifndef LLAMA_CLBLAST
266
  ifndef LLAMA_CUBLAS
 
267
  OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
268
  endif
269
  endif
270
  endif
 
271
  endif
272
 
273
 
@@ -293,16 +339,16 @@ $(info )
293
 
294
  ggml.o: ggml.c ggml.h ggml-cuda.h k_quants.h
295
  $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
296
- ggml_openblas.o: ggml.c ggml.h
297
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
298
- ggml_failsafe.o: ggml.c ggml.h
299
  $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
300
- ggml_noavx2.o: ggml.c ggml.h
301
  $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
302
- ggml_clblast.o: ggml.c ggml.h
303
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
304
- ggml_cublas.o: ggml.c ggml.h
305
- $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
306
 
307
  #quants K
308
  k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
@@ -328,7 +374,7 @@ ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
328
  ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
329
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
330
  ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
331
- $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
332
 
333
  #extreme old version compat
334
  ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
@@ -345,19 +391,19 @@ ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-ope
345
  $(CC) $(CFLAGS) -c $< -o $@
346
 
347
  # intermediate objects
348
- llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
349
  $(CXX) $(CXXFLAGS) -c $< -o $@
350
- common.o: examples/common.cpp examples/common.h
351
  $(CXX) $(CXXFLAGS) -c $< -o $@
352
- console.o: examples/console.cpp examples/console.h
353
  $(CXX) $(CXXFLAGS) -c $< -o $@
354
- grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
355
  $(CXX) $(CXXFLAGS) -c $< -o $@
356
  expose.o: expose.cpp expose.h
357
  $(CXX) $(CXXFLAGS) -c $< -o $@
358
 
359
  # idiotic "for easier compilation"
360
- GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp llama.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml.h ggml-cuda.h llama.h llama-util.h
361
  gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
362
  $(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
363
  gpttype_adapter.o: $(GPTTYPE_ADAPTER)
@@ -365,10 +411,10 @@ gpttype_adapter.o: $(GPTTYPE_ADAPTER)
365
  gpttype_adapter_clblast.o: $(GPTTYPE_ADAPTER)
366
  $(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
367
  gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
368
- $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
369
 
370
  clean:
371
- rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so
372
 
373
  main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
374
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -376,19 +422,24 @@ main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o
376
  @echo '==== Run ./main -h for help. ===='
377
  @echo
378
 
 
 
 
379
  #generated libraries
380
- koboldcpp: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o $(OBJS)
381
  $(DEFAULT_BUILD)
382
- koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o $(OBJS)
383
  $(OPENBLAS_BUILD)
384
- koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o $(OBJS)
385
  $(FAILSAFE_BUILD)
386
- koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o $(OBJS)
387
  $(NOAVX2_BUILD)
388
- koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o $(OBJS)
389
  $(CLBLAST_BUILD)
390
- koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(OBJS)
391
  $(CUBLAS_BUILD)
 
 
392
 
393
  quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o
394
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
1
+ default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas koboldcpp_hipblas
2
  tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
3
  dev: koboldcpp_openblas
4
  dev2: koboldcpp_clblast
 
20
  ARCH_ADD = -lcblas
21
  endif
22
 
 
 
23
 
24
  # Mac OS + Arm can report x86_64
25
  # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 
39
  #
40
 
41
  # keep standard at C11 and C++11
42
+ CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE
43
+ CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE
44
  LDFLAGS =
45
 
46
  # these are used on windows, to build some libraries with extra old device compatibility
 
108
  # old library NEEDS mf16c to work. so we must build with it. new one doesnt
109
  ifeq ($(OS),Windows_NT)
110
  CFLAGS +=
111
+ NONECFLAGS +=
112
+ # -mno-sse3
113
  SIMPLECFLAGS += -mavx -msse3
114
  FULLCFLAGS += -mavx2 -msse3 -mfma -mf16c -mavx
115
  else
 
194
  $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
195
  endif # LLAMA_CUBLAS
196
 
197
+ ifdef LLAMA_HIPBLAS
198
+ ROCM_PATH ?= /opt/rocm
199
+ CC := $(ROCM_PATH)/llvm/bin/clang
200
+ CXX := $(ROCM_PATH)/llvm/bin/clang++
201
+ GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
202
+ LLAMA_CUDA_DMMV_X ?= 32
203
+ LLAMA_CUDA_MMV_Y ?= 2
204
+ LLAMA_CUDA_KQUANTS_ITER ?= 2
205
+ HIPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
206
+ ifdef LLAMA_CUDA_FORCE_DMMV
207
+ HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
208
+ endif # LLAMA_CUDA_FORCE_DMMV
209
+ HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64 -lrocblas
210
+ HIP_OBJS += ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
211
+ ggml-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
212
+ -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \
213
+ -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \
214
+ -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
215
+ ggml_v2-cuda.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
216
+ -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \
217
+ -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \
218
+ -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
219
+ ggml_v2-cuda-legacy.o: HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) \
220
+ -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) \
221
+ -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) \
222
+ -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
223
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
224
+ $(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
225
+ ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
226
+ $(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
227
+ ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
228
+ $(CXX) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
229
+ endif # LLAMA_HIPBLAS
230
+
231
+
232
+
233
  ifdef LLAMA_METAL
234
  CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
235
  CXXFLAGS += -DGGML_USE_METAL
 
259
  CFLAGS += -mfp16-format=ieee -mno-unaligned-access
260
  endif
261
 
262
+ CCV := $(shell $(CC) --version | head -n 1)
263
+ CXXV := $(shell $(CXX) --version | head -n 1)
264
+
265
  DEFAULT_BUILD =
266
  FAILSAFE_BUILD =
267
  OPENBLAS_BUILD =
268
  NOAVX2_BUILD =
269
  CLBLAST_BUILD =
270
  CUBLAS_BUILD =
271
+ HIPBLAS_BUILD =
272
 
273
  ifeq ($(OS),Windows_NT)
274
  DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
 
277
  NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
278
  CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o [email protected] $(LDFLAGS)
279
 
280
+ ifdef LLAMA_CUBLAS
281
+ CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o [email protected] $(CUBLASLD_FLAGS) $(LDFLAGS)
282
+ endif
283
+ ifdef LLAMA_HIPBLAS
284
+ HIPBLAS_BUILD = $(CXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o [email protected] $(HIPLDFLAGS) $(LDFLAGS)
285
+ endif
286
  else
287
  DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
288
  FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
 
291
  NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
292
  endif
293
  ifdef LLAMA_CLBLAST
294
+ ifeq ($(UNAME_S),Darwin)
295
+ CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
296
+ else
297
+ CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
298
+ endif
299
  endif
300
 
301
+ ifdef LLAMA_CUBLAS
302
+ CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o [email protected] $(CUBLASLD_FLAGS) $(LDFLAGS)
303
+ endif
304
+ ifdef LLAMA_HIPBLAS
305
+ HIPBLAS_BUILD = $(CXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o [email protected] $(HIPLDFLAGS) $(LDFLAGS)
306
+ endif
307
 
308
  ifndef LLAMA_OPENBLAS
309
  ifndef LLAMA_CLBLAST
310
  ifndef LLAMA_CUBLAS
311
+ ifndef LLAMA_HIPBLAS
312
  OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
313
  endif
314
  endif
315
  endif
316
+ endif
317
  endif
318
 
319
 
 
339
 
340
  ggml.o: ggml.c ggml.h ggml-cuda.h k_quants.h
341
  $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
342
+ ggml_openblas.o: ggml.c ggml.h ggml-cuda.h k_quants.h
343
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
344
+ ggml_failsafe.o: ggml.c ggml.h ggml-cuda.h k_quants.h
345
  $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
346
+ ggml_noavx2.o: ggml.c ggml.h ggml-cuda.h k_quants.h
347
  $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
348
+ ggml_clblast.o: ggml.c ggml.h ggml-cuda.h k_quants.h
349
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
350
+ ggml_cublas.o: ggml.c ggml.h ggml-cuda.h k_quants.h
351
+ $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
352
 
353
  #quants K
354
  k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
 
374
  ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
375
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
376
  ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
377
+ $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
378
 
379
  #extreme old version compat
380
  ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
 
391
  $(CC) $(CFLAGS) -c $< -o $@
392
 
393
  # intermediate objects
394
+ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h
395
  $(CXX) $(CXXFLAGS) -c $< -o $@
396
+ common.o: common/common.cpp common/common.h common/log.h
397
  $(CXX) $(CXXFLAGS) -c $< -o $@
398
+ console.o: common/console.cpp common/console.h
399
  $(CXX) $(CXXFLAGS) -c $< -o $@
400
+ grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
401
  $(CXX) $(CXXFLAGS) -c $< -o $@
402
  expose.o: expose.cpp expose.h
403
  $(CXX) $(CXXFLAGS) -c $< -o $@
404
 
405
  # idiotic "for easier compilation"
406
+ GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp llama.cpp otherarch/utils.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml.h ggml-cuda.h llama.h otherarch/llama-util.h
407
  gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
408
  $(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
409
  gpttype_adapter.o: $(GPTTYPE_ADAPTER)
 
411
  gpttype_adapter_clblast.o: $(GPTTYPE_ADAPTER)
412
  $(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
413
  gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
414
+ $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
415
 
416
  clean:
417
+ rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so
418
 
419
  main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
420
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
422
  @echo '==== Run ./main -h for help. ===='
423
  @echo
424
 
425
+ gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
426
+ $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
427
+
428
  #generated libraries
429
+ koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
430
  $(DEFAULT_BUILD)
431
+ koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
432
  $(OPENBLAS_BUILD)
433
+ koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o grammar-parser.o $(OBJS)
434
  $(FAILSAFE_BUILD)
435
+ koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o grammar-parser.o $(OBJS)
436
  $(NOAVX2_BUILD)
437
+ koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
438
  $(CLBLAST_BUILD)
439
+ koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
440
  $(CUBLAS_BUILD)
441
+ koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS)
442
+ $(HIPBLAS_BUILD)
443
 
444
  quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o
445
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
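A sketch of driving the new hipBLAS target added above (assumes ROCm under /opt/rocm; GPU_TARGETS can be overridden exactly as the Makefile allows):
make LLAMA_HIPBLAS=1 koboldcpp_hipblas -j
make LLAMA_HIPBLAS=1 GPU_TARGETS=gfx1030 koboldcpp_hipblas -j   # restrict offload to a single arch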
Package.swift CHANGED
@@ -2,8 +2,30 @@
2
 
3
  import PackageDescription

5
  let package = Package(
6
  name: "llama",
 
7
  products: [
8
  .library(name: "llama", targets: ["llama"]),
9
  ],
@@ -11,14 +33,23 @@ let package = Package(
11
  .target(
12
  name: "llama",
13
  path: ".",
14
- exclude: ["ggml-metal.metal"],
15
- sources: ["ggml.c", "llama.cpp"],
 
 
 
 
 
16
  publicHeadersPath: "spm-headers",
17
- cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
 
 
 
 
18
  linkerSettings: [
19
  .linkedFramework("Accelerate")
20
  ]
21
- ),
22
  ],
23
  cxxLanguageStandard: .cxx11
24
  )
 
2
 
3
  import PackageDescription
4
 
5
+ #if arch(arm) || arch(arm64)
6
+ let platforms: [SupportedPlatform]? = [
7
+ .macOS(.v11),
8
+ .iOS(.v14),
9
+ .watchOS(.v4),
10
+ .tvOS(.v14)
11
+ ]
12
+ let exclude: [String] = []
13
+ let additionalSources: [String] = ["ggml-metal.m"]
14
+ let additionalSettings: [CSetting] = [
15
+ .unsafeFlags(["-fno-objc-arc"]),
16
+ .define("GGML_SWIFT"),
17
+ .define("GGML_USE_METAL")
18
+ ]
19
+ #else
20
+ let platforms: [SupportedPlatform]? = nil
21
+ let exclude: [String] = ["ggml-metal.metal"]
22
+ let additionalSources: [String] = []
23
+ let additionalSettings: [CSetting] = []
24
+ #endif
25
+
26
  let package = Package(
27
  name: "llama",
28
+ platforms: platforms,
29
  products: [
30
  .library(name: "llama", targets: ["llama"]),
31
  ],
 
33
  .target(
34
  name: "llama",
35
  path: ".",
36
+ exclude: exclude,
37
+ sources: [
38
+ "ggml.c",
39
+ "llama.cpp",
40
+ "ggml-alloc.c",
41
+ "k_quants.c",
42
+ ] + additionalSources,
43
  publicHeadersPath: "spm-headers",
44
+ cSettings: [
45
+ .unsafeFlags(["-Wno-shorten-64-to-32"]),
46
+ .define("GGML_USE_K_QUANTS"),
47
+ .define("GGML_USE_ACCELERATE")
48
+ ] + additionalSettings,
49
  linkerSettings: [
50
  .linkedFramework("Accelerate")
51
  ]
52
+ )
53
  ],
54
  cxxLanguageStandard: .cxx11
55
  )
README.md CHANGED
@@ -3,4 +3,4 @@ sdk: docker
3
  emoji: 🚀
4
  colorFrom: yellow
5
  colorTo: blue
6
- ---
 
3
  emoji: 🚀
4
  colorFrom: yellow
5
  colorTo: blue
6
+ ---
build-info.h CHANGED
@@ -3,5 +3,7 @@
3
 
4
  #define BUILD_NUMBER 999
5
  #define BUILD_COMMIT "KOBOLDCPP"
 
 
6
 
7
  #endif // BUILD_INFO_H
 
3
 
4
  #define BUILD_NUMBER 999
5
  #define BUILD_COMMIT "KOBOLDCPP"
6
+ #define BUILD_COMPILER "KCPP"
7
+ #define BUILD_TARGET "KCPP"
8
 
9
  #endif // BUILD_INFO_H
ci/run.sh CHANGED
@@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {
159
 
160
  python3 ../convert.py ${path_models}
161
 
162
- model_f16="${path_models}/ggml-model-f16.bin"
163
- model_q8_0="${path_models}/ggml-model-q8_0.bin"
164
- model_q4_0="${path_models}/ggml-model-q4_0.bin"
165
- model_q4_1="${path_models}/ggml-model-q4_1.bin"
166
- model_q5_0="${path_models}/ggml-model-q5_0.bin"
167
- model_q5_1="${path_models}/ggml-model-q5_1.bin"
168
- model_q2_k="${path_models}/ggml-model-q2_k.bin"
169
- model_q3_k="${path_models}/ggml-model-q3_k.bin"
170
- model_q4_k="${path_models}/ggml-model-q4_k.bin"
171
- model_q5_k="${path_models}/ggml-model-q5_k.bin"
172
- model_q6_k="${path_models}/ggml-model-q6_k.bin"
173
 
174
  wiki_test_60="${path_wiki}/wiki.test-60.raw"
175
 
@@ -196,17 +196,17 @@ function gg_run_open_llama_3b_v2 {
196
  (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
197
  (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
198
 
199
- (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
200
- (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
201
- (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
202
- (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
203
- (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
204
- (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
205
- (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
206
- (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
207
- (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
208
- (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
209
- (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
210
 
211
  function check_ppl {
212
  qnt="$1"
@@ -233,6 +233,48 @@ function gg_run_open_llama_3b_v2 {
233
  check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
234
  check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
235
 
236
  set +e
237
  }
238
 
@@ -242,6 +284,7 @@ function gg_sum_open_llama_3b_v2 {
242
  gg_printf 'OpenLLaMA 3B-v2:\n'
243
  gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
244
  gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
 
245
  gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
246
  gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
247
  gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -253,6 +296,11 @@ function gg_sum_open_llama_3b_v2 {
253
  gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
254
  gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
255
  gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
 
  }
257
 
258
  # open_llama_7b_v2
@@ -285,17 +333,17 @@ function gg_run_open_llama_7b_v2 {
285
 
286
  python3 ../convert.py ${path_models}
287
 
288
- model_f16="${path_models}/ggml-model-f16.bin"
289
- model_q8_0="${path_models}/ggml-model-q8_0.bin"
290
- model_q4_0="${path_models}/ggml-model-q4_0.bin"
291
- model_q4_1="${path_models}/ggml-model-q4_1.bin"
292
- model_q5_0="${path_models}/ggml-model-q5_0.bin"
293
- model_q5_1="${path_models}/ggml-model-q5_1.bin"
294
- model_q2_k="${path_models}/ggml-model-q2_k.bin"
295
- model_q3_k="${path_models}/ggml-model-q3_k.bin"
296
- model_q4_k="${path_models}/ggml-model-q4_k.bin"
297
- model_q5_k="${path_models}/ggml-model-q5_k.bin"
298
- model_q6_k="${path_models}/ggml-model-q6_k.bin"
299
 
300
  wiki_test="${path_wiki}/wiki.test.raw"
301
 
@@ -310,17 +358,17 @@ function gg_run_open_llama_7b_v2 {
310
  ./bin/quantize ${model_f16} ${model_q5_k} q5_k
311
  ./bin/quantize ${model_f16} ${model_q6_k} q6_k
312
 
313
- (time ./bin/main --model ${model_f16} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
314
- (time ./bin/main --model ${model_q8_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
315
- (time ./bin/main --model ${model_q4_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
316
- (time ./bin/main --model ${model_q4_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
317
- (time ./bin/main --model ${model_q5_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
318
- (time ./bin/main --model ${model_q5_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
319
- (time ./bin/main --model ${model_q2_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
320
- (time ./bin/main --model ${model_q3_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
321
- (time ./bin/main --model ${model_q4_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
322
- (time ./bin/main --model ${model_q5_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
323
- (time ./bin/main --model ${model_q6_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
324
 
325
  (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
326
  (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -359,6 +407,48 @@ function gg_run_open_llama_7b_v2 {
359
  check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
360
  check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
361
 
362
  set +e
363
  }
364
 
@@ -368,6 +458,7 @@ function gg_sum_open_llama_7b_v2 {
368
  gg_printf 'OpenLLaMA 7B-v2:\n'
369
  gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
370
  gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
 
371
  gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
372
  gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
373
  gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -379,6 +470,11 @@ function gg_sum_open_llama_7b_v2 {
379
  gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
380
  gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
381
  gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
  }
383
 
384
  ## main
@@ -391,6 +487,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
391
  ln -sfn ${mnt_models} ${SRC}/models-mnt
392
 
393
  python3 -m pip install -r ${SRC}/requirements.txt
 
394
  fi
395
 
396
  ret=0
 
159
 
160
  python3 ../convert.py ${path_models}
161
 
162
+ model_f16="${path_models}/ggml-model-f16.gguf"
163
+ model_q8_0="${path_models}/ggml-model-q8_0.gguf"
164
+ model_q4_0="${path_models}/ggml-model-q4_0.gguf"
165
+ model_q4_1="${path_models}/ggml-model-q4_1.gguf"
166
+ model_q5_0="${path_models}/ggml-model-q5_0.gguf"
167
+ model_q5_1="${path_models}/ggml-model-q5_1.gguf"
168
+ model_q2_k="${path_models}/ggml-model-q2_k.gguf"
169
+ model_q3_k="${path_models}/ggml-model-q3_k.gguf"
170
+ model_q4_k="${path_models}/ggml-model-q4_k.gguf"
171
+ model_q5_k="${path_models}/ggml-model-q5_k.gguf"
172
+ model_q6_k="${path_models}/ggml-model-q6_k.gguf"
173
 
174
  wiki_test_60="${path_wiki}/wiki.test-60.raw"
175
 
 
196
  (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
197
  (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
198
 
199
+ (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
200
+ (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
201
+ (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
202
+ (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
203
+ (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
204
+ (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
205
+ (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
206
+ (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
207
+ (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
208
+ (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
209
+ (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
210
 
211
  function check_ppl {
212
  qnt="$1"
 
233
  check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
234
  check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
235
 
236
+ # lora
237
+ function compare_ppl {
238
+ qnt="$1"
239
+ ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
240
+ ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
241
+
242
+ if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
243
+ printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl1" "$ppl2" "$ppl1"
244
+ return 20
245
+ fi
246
+
247
+ printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
248
+ return 0
249
+ }
250
+
251
+ path_lora="../models-mnt/open-llama/3B-v2/lora"
252
+ path_shakespeare="../models-mnt/shakespeare"
253
+
254
+ shakespeare="${path_shakespeare}/shakespeare.txt"
255
+ lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
256
+
257
+ gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
258
+ gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
259
+ gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
260
+
261
+ python3 ../convert-lora-to-ggml.py ${path_lora}
262
+
263
+ # f16
264
+ (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
265
+ (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
266
+ compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
267
+
268
+ # q8_0
269
+ (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
270
+ (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
271
+ compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
272
+
273
+ # q8_0 + f16 lora-base
274
+ (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
275
+ compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
276
+
277
+
278
  set +e
279
  }
280
 
 
284
  gg_printf 'OpenLLaMA 3B-v2:\n'
285
  gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
286
  gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
287
+ gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
288
  gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
289
  gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
290
  gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
 
296
  gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
297
  gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
298
  gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
299
+ gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
300
+ gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
301
+ gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
302
+ gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
303
+ gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
304
  }
305
 
306
  # open_llama_7b_v2
 
333
 
334
  python3 ../convert.py ${path_models}
335
 
336
+ model_f16="${path_models}/ggml-model-f16.gguf"
337
+ model_q8_0="${path_models}/ggml-model-q8_0.gguf"
338
+ model_q4_0="${path_models}/ggml-model-q4_0.gguf"
339
+ model_q4_1="${path_models}/ggml-model-q4_1.gguf"
340
+ model_q5_0="${path_models}/ggml-model-q5_0.gguf"
341
+ model_q5_1="${path_models}/ggml-model-q5_1.gguf"
342
+ model_q2_k="${path_models}/ggml-model-q2_k.gguf"
343
+ model_q3_k="${path_models}/ggml-model-q3_k.gguf"
344
+ model_q4_k="${path_models}/ggml-model-q4_k.gguf"
345
+ model_q5_k="${path_models}/ggml-model-q5_k.gguf"
346
+ model_q6_k="${path_models}/ggml-model-q6_k.gguf"
347
 
348
  wiki_test="${path_wiki}/wiki.test.raw"
349
 
 
358
  ./bin/quantize ${model_f16} ${model_q5_k} q5_k
359
  ./bin/quantize ${model_f16} ${model_q6_k} q6_k
360
 
361
+ (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
362
+ (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
363
+ (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
364
+ (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
365
+ (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
366
+ (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
367
+ (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
368
+ (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
369
+ (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
370
+ (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
371
+ (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
372
 
373
  (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
374
  (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
 
407
  check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
408
  check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
409
 
410
+ # lora
411
+ function compare_ppl {
412
+ qnt="$1"
413
+ ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
414
+ ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
415
+
416
+ if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
417
+ printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl1" "$ppl2" "$ppl1"
418
+ return 20
419
+ fi
420
+
421
+ printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
422
+ return 0
423
+ }
424
+
425
+ path_lora="../models-mnt/open-llama/7B-v2/lora"
426
+ path_shakespeare="../models-mnt/shakespeare"
427
+
428
+ shakespeare="${path_shakespeare}/shakespeare.txt"
429
+ lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
430
+
431
+ gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
432
+ gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
433
+ gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
434
+
435
+ python3 ../convert-lora-to-ggml.py ${path_lora}
436
+
437
+ # f16
438
+ (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
439
+ (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
440
+ compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
441
+
442
+ # currently not supported by the CUDA backend
443
+ # q8_0
444
+ #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
445
+ #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
446
+ #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
447
+
448
+ # q8_0 + f16 lora-base
449
+ #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
450
+ #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
451
+
452
  set +e
453
  }
454
 
 
458
  gg_printf 'OpenLLaMA 7B-v2:\n'
459
  gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
460
  gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
461
+ gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
462
  gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
463
  gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
464
  gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
 
470
  gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
471
  gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
472
  gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
473
+ gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
474
+ gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
475
+ #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
476
+ #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
477
+ #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
478
  }
479
 
480
  ## main
 
487
  ln -sfn ${mnt_models} ${SRC}/models-mnt
488
 
489
  python3 -m pip install -r ${SRC}/requirements.txt
490
+ python3 -m pip install --editable gguf-py
491
  fi
492
 
493
  ret=0
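The lora checks added to ci/run.sh above boil down to: extract the last perplexity value from each log excerpt and fail when the LoRA run scores worse than its base model. A small Python restatement of that comparison, included only to make the intent explicit (the log snippets are made up; the shell script remains the source of truth):

import re

def last_perplexity(log_excerpt):
    # Equivalent of: grep -oE "[0-9]+\.[0-9]+" | tail -n 1
    matches = re.findall(r"[0-9]+\.[0-9]+", log_excerpt)
    if not matches:
        raise ValueError("no perplexity value found in log excerpt")
    return float(matches[-1])

def compare_ppl(label, base_log, lora_log):
    base = last_perplexity(base_log)
    lora = last_perplexity(lora_log)
    if base < lora:  # the LoRA adapter should not be worse on its own training text
        print(f" - {label} @ {base} (FAIL: {lora} > {base})")
        return False
    print(f" - {label} @ {base} {lora} OK")
    return True

compare_ppl("f16 shakespeare", "[1]4.2178", "[1]3.9012")  # made-up numbers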
class.py ADDED
@@ -0,0 +1,313 @@
1
+ ## KoboldCpp based GGML Backend by Concedo
2
+ ## For use as a custom backend in KoboldAI United
3
+ ## Not intended for general use.
4
+
5
+ from __future__ import annotations
6
+
7
+ import time, json
8
+ import torch
9
+ import requests
10
+ import numpy as np
11
+ from typing import List, Optional, Union
12
+ import os
13
+ from . import koboldcpp
14
+
15
+ import utils
16
+ from logger import logger
17
+ from modeling.inference_model import (
18
+ GenerationResult,
19
+ GenerationSettings,
20
+ InferenceModel,
21
+ )
22
+
23
+ model_backend_name = "koboldcpp" #specific instead of ggml
24
+ model_backend_type = "ggml" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)
25
+
26
+ kcpp_backend_loaded = False
27
+
28
+ class KoboldCppException(Exception):
29
+ """To be used for errors on cpp side of KoboldCpp."""
30
+
31
+ class KcppArgsObject:
32
+ def __init__(self, **kwargs):
33
+ self.__dict__.update(kwargs)
34
+
35
+ class model_backend(InferenceModel):
36
+ def __init__(self) -> None:
37
+ super().__init__()
38
+
39
+ def is_valid(self, model_name, model_path, menu_path):
40
+
41
+ foundfile = False
42
+ try:
43
+ files = os.listdir(model_path)
44
+ foundfile = len([filename for filename in files if (("ggml" in filename.lower() and ".bin" in filename.lower()) or ".gguf" in filename.lower())])>0
45
+ except:
46
+ pass
47
+ return foundfile
48
+
49
+ def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
50
+
51
+ self.kcpp_threads = 5
52
+ self.model_name = "GGML_Model"
53
+ self.kcpp_ctxsize = 2048
54
+ self.kcpp_blasbatchsize = 512
55
+ self.kcpp_gpulayers = 0
56
+ self.kcpp_smartcontext = False
57
+ self.kcpp_ropescale = 0.0
58
+ self.kcpp_ropebase = 10000.0
59
+ self.kcpp_useclblast = None
60
+ self.kcpp_usecublas = None
61
+ self.kcpp_noblas = False
62
+ self.kcpp_noavx2 = False
63
+ self.kcpp_nommap = False
64
+ self.kcpp_debugmode = 0
65
+ self.kcpp_tensor_split_str = ""
66
+ self.kcpp_tensor_split = None
67
+
68
+ files = os.listdir(model_path)
69
+ foundfiles = [filename for filename in files if (("ggml" in filename.lower() and ".bin" in filename.lower()) or ".gguf" in filename.lower())]
70
+
71
+ requested_parameters = []
72
+ foldermdls = []
73
+ for ff in foundfiles:
74
+ foldermdls.append({'text': ff, 'value': os.path.join(model_path, ff)})
75
+ requested_parameters.append({
76
+ "uitype": "dropdown",
77
+ "unit": "string",
78
+ "label": "GGML DataFile Name",
79
+ "id": "kcpp_filename",
80
+ "default": os.path.join(model_path, foundfiles[0]) if len(foundfiles)>0 else model_name,
81
+ "check": {"value": "", 'check': "!="},
82
+ "tooltip": "Actual GGML DataFile Name",
83
+ "menu_path": "",
84
+ "refresh_model_inputs": False,
85
+ "extra_classes": "",
86
+ 'children': foldermdls
87
+ })
88
+ requested_parameters.append({
89
+ "uitype": "dropdown",
90
+ "unit": "int",
91
+ "label": "KoboldCpp Accelerator",
92
+ "id": "kcpp_accelerator",
93
+ "default": 0,
94
+ "check": {"value": "", 'check': "!="},
95
+ 'multiple': False,
96
+ "tooltip": "KoboldCpp Accelerator",
97
+ "menu_path": "",
98
+ "refresh_model_inputs": False,
99
+ "extra_classes": "",
100
+ 'children': [{'text': 'Use No BLAS', 'value': 0}, {'text': 'Use OpenBLAS', 'value': 1}, {'text': 'Use CuBLAS', 'value': 2},
101
+ {'text': 'Use CLBLast GPU #1', 'value': 3},{'text': 'Use CLBLast GPU #2', 'value': 4},{'text': 'Use CLBLast GPU #3', 'value': 5}
102
+ ,{'text': 'NoAVX2 Mode (Old CPU)', 'value': 6},{'text': 'Failsafe Mode (Old CPU)', 'value': 7}],
103
+ })
104
+ requested_parameters.append({
105
+ "uitype": "text",
106
+ "unit": "int",
107
+ "label": "Threads",
108
+ "id": "kcpp_threads",
109
+ "default": self.kcpp_threads,
110
+ "check": {"value": "", 'check': "!="},
111
+ "tooltip": "Thread Count",
112
+ "menu_path": "",
113
+ "refresh_model_inputs": False,
114
+ "extra_classes": ""
115
+ })
116
+
117
+ requested_parameters.append({
118
+ "uitype": "text",
119
+ "unit": "int",
120
+ "label": "Max Context Size",
121
+ "id": "kcpp_ctxsize",
122
+ "default": self.kcpp_ctxsize,
123
+ "check": {"value": "", 'check': "!="},
124
+ "tooltip": "Max Context Size",
125
+ "menu_path": "",
126
+ "refresh_model_inputs": False,
127
+ "extra_classes": ""
128
+ })
129
+ requested_parameters.append({
130
+ "uitype": "text",
131
+ "unit": "int",
132
+ "label": "BLAS Batch Size",
133
+ "id": "kcpp_blasbatchsize",
134
+ "default": self.kcpp_blasbatchsize,
135
+ "check": {"value": "", 'check': "!="},
136
+ "tooltip": "BLAS Batch Size",
137
+ "menu_path": "",
138
+ "refresh_model_inputs": False,
139
+ "extra_classes": ""
140
+ })
141
+ requested_parameters.append({
142
+ "uitype": "text",
143
+ "unit": "int",
144
+ "label": "GPU Layers",
145
+ "id": "kcpp_gpulayers",
146
+ "default": self.kcpp_gpulayers,
147
+ "check": {"value": "", 'check': "!="},
148
+ "tooltip": "GPU Layers",
149
+ "menu_path": "",
150
+ "refresh_model_inputs": False,
151
+ "extra_classes": ""
152
+ })
153
+ requested_parameters.append({
154
+ "uitype": "text",
155
+ "unit": "int",
156
+ "label": "Rope Scale",
157
+ "id": "kcpp_ropescale",
158
+ "default": self.kcpp_ropescale,
159
+ "check": {"value": "", 'check': "!="},
160
+ "tooltip": "Rope Scale",
161
+ "menu_path": "",
162
+ "refresh_model_inputs": False,
163
+ "extra_classes": ""
164
+ })
165
+ requested_parameters.append({
166
+ "uitype": "text",
167
+ "unit": "int",
168
+ "label": "Rope Base",
169
+ "id": "kcpp_ropebase",
170
+ "default": self.kcpp_ropebase,
171
+ "check": {"value": "", 'check': "!="},
172
+ "tooltip": "Rope Base",
173
+ "menu_path": "",
174
+ "refresh_model_inputs": False,
175
+ "extra_classes": ""
176
+ })
177
+ requested_parameters.append({
178
+ "uitype": "dropdown",
179
+ "unit": "int",
180
+ "label": "Smart Context",
181
+ "id": "kcpp_smartcontext",
182
+ "default": self.kcpp_smartcontext,
183
+ "check": {"value": "", 'check': "!="},
184
+ 'multiple': False,
185
+ "tooltip": "Smart Context",
186
+ "menu_path": "",
187
+ "refresh_model_inputs": False,
188
+ "extra_classes": "",
189
+ 'children': [{'text': 'False', 'value': False}, {'text': 'True', 'value': True}],
190
+ })
191
+ requested_parameters.append({
192
+ "uitype": "dropdown",
193
+ "unit": "int",
194
+ "label": "Debug Mode",
195
+ "id": "kcpp_debugmode",
196
+ "default": self.kcpp_debugmode,
197
+ "check": {"value": "", 'check': "!="},
198
+ 'multiple': False,
199
+ "tooltip": "Debug Mode",
200
+ "menu_path": "",
201
+ "refresh_model_inputs": False,
202
+ "extra_classes": "",
203
+ 'children': [{'text': 'False', 'value': 0}, {'text': 'True', 'value': 1}],
204
+ })
205
+ requested_parameters.append({
206
+ "uitype": "text",
207
+ "unit": "text",
208
+ "label": "Tensor Split",
209
+ "id": "kcpp_tensor_split_str",
210
+ "default": self.kcpp_tensor_split_str,
211
+ "check": {"value": "", 'check': "!="},
212
+ "tooltip": "Tensor Split, values are space separated",
213
+ "menu_path": "",
214
+ "refresh_model_inputs": False,
215
+ "extra_classes": ""
216
+ })
217
+ return requested_parameters
218
+
219
+ def set_input_parameters(self, parameters):
220
+ self.kcpp_threads = parameters["kcpp_threads"]
221
+ self.kcpp_filename = parameters["kcpp_filename"]
222
+ self.kcpp_ctxsize = parameters["kcpp_ctxsize"]
223
+ self.kcpp_blasbatchsize = parameters["kcpp_blasbatchsize"]
224
+ self.kcpp_gpulayers = parameters["kcpp_gpulayers"]
225
+ self.kcpp_smartcontext = parameters["kcpp_smartcontext"]
226
+ self.kcpp_ropescale = parameters["kcpp_ropescale"]
227
+ self.kcpp_ropebase = parameters["kcpp_ropebase"]
228
+ self.kcpp_debugmode = parameters["kcpp_debugmode"]
229
+ self.kcpp_tensor_split_str = parameters["kcpp_tensor_split_str"]
230
+ if self.kcpp_tensor_split_str and self.kcpp_tensor_split_str!="":
231
+ splits = self.kcpp_tensor_split_str.split()
232
+ self.kcpp_tensor_split = []
233
+ for s in splits:
234
+ self.kcpp_tensor_split.append(int(s))
235
+
236
+ accel = parameters["kcpp_accelerator"]
237
+ if accel==0:
238
+ self.kcpp_noblas = True
239
+ elif accel==1:
240
+ pass
241
+ elif accel==2:
242
+ self.kcpp_usecublas = ["normal"]
243
+ elif accel==3:
244
+ self.kcpp_useclblast = [0,0]
245
+ elif accel==4:
246
+ self.kcpp_useclblast = [1,0]
247
+ elif accel==5:
248
+ self.kcpp_useclblast = [0,1]
249
+ elif accel==6:
250
+ self.kcpp_noavx2 = True
251
+ elif accel==7:
252
+ self.kcpp_noavx2 = True
253
+ self.kcpp_noblas = True
254
+ self.kcpp_nommap = True
255
+ pass
256
+
257
+ def unload(self):
258
+ print("Attempting to unload library")
259
+ koboldcpp.unload_libs()
260
+ global kcpp_backend_loaded
261
+ kcpp_backend_loaded = False
262
+ pass
263
+
264
+ def _load(self, save_model: bool, initial_load: bool) -> None:
265
+ global kcpp_backend_loaded
266
+ self.tokenizer = self._get_tokenizer("gpt2")
267
+ if not kcpp_backend_loaded:
268
+ kcppargs = KcppArgsObject(model=self.kcpp_filename, model_param=self.kcpp_filename,
269
+ port=5001, port_param=5001, host='', launch=False, lora=None, threads=self.kcpp_threads, blasthreads=self.kcpp_threads,
270
+ psutil_set_threads=False, highpriority=False, contextsize=self.kcpp_ctxsize,
271
+ blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase], stream=False, smartcontext=self.kcpp_smartcontext,
272
+ unbantokens=False, bantokens=None, usemirostat=None, forceversion=0, nommap=self.kcpp_nommap,
273
+ usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas,
274
+ useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None, onready='', multiuser=False)
275
+
276
+ koboldcpp.main(kcppargs,False) #initialize library without enabling Lite http server
277
+ kcpp_backend_loaded = True
278
+ pass
279
+
280
+ def _save_settings(self):
281
+ pass
282
+
283
+ def _raw_generate(
284
+ self,
285
+ prompt_tokens: Union[List[int], torch.Tensor],
286
+ max_new: int,
287
+ gen_settings: GenerationSettings,
288
+ single_line: bool = False,
289
+ batch_count: int = 1,
290
+ seed: Optional[int] = None,
291
+ **kwargs,
292
+ ) -> GenerationResult:
293
+
294
+ decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))
295
+
296
+ # Store context in memory to use it for comparison with generated content
297
+ utils.koboldai_vars.lastctx = decoded_prompt
298
+
299
+ genresult = koboldcpp.generate(decoded_prompt,max_new,utils.koboldai_vars.max_length,
300
+ gen_settings.temp,int(gen_settings.top_k),gen_settings.top_a,gen_settings.top_p,
301
+ gen_settings.typical,gen_settings.tfs,gen_settings.rep_pen,gen_settings.rep_pen_range,
302
+ sampler_order=gen_settings.sampler_order,use_default_badwordsids=utils.koboldai_vars.use_default_badwordsids)
303
+
304
+ outputs = [genresult]
305
+ return GenerationResult(
306
+ model=self,
307
+ out_batches=np.array(
308
+ [self.tokenizer.encode(x) for x in outputs]
309
+ ),
310
+ prompt=prompt_tokens,
311
+ is_whole_generation=True,
312
+ single_line=single_line,
313
+ )
codecov.yml ADDED
@@ -0,0 +1,14 @@
1
+ comment: off
2
+
3
+ coverage:
4
+ status:
5
+ project:
6
+ default:
7
+ target: auto
8
+ threshold: 0
9
+ base: auto
10
+ patch:
11
+ default:
12
+ target: auto
13
+ threshold: 0
14
+ base: auto
colab.ipynb ADDED
@@ -0,0 +1,61 @@
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "private_outputs": true,
7
+ "provenance": [],
8
+ "gpuType": "T4",
9
+ "authorship_tag": "ABX9TyOv14c2MWENhO6RJ3uy6vD7",
10
+ "include_colab_link": true
11
+ },
12
+ "kernelspec": {
13
+ "name": "python3",
14
+ "display_name": "Python 3"
15
+ },
16
+ "language_info": {
17
+ "name": "python"
18
+ },
19
+ "accelerator": "GPU"
20
+ },
21
+ "cells": [
22
+ {
23
+ "cell_type": "markdown",
24
+ "metadata": {
25
+ "id": "view-in-github",
26
+ "colab_type": "text"
27
+ },
28
+ "source": [
29
+ "<a href=\"https://colab.research.google.com/github/henk717/koboldcpp/blob/concedo/colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {
36
+ "cellView": "form",
37
+ "id": "uJS9i_Dltv8Y"
38
+ },
39
+ "outputs": [],
40
+ "source": [
41
+ "#@title <b>v-- Enter your model below and then click this to start Koboldcpp</b>\n",
42
+ "\n",
43
+ "Model = \"https://huggingface.co/TheBloke/airoboros-l2-13B-gpt4-1.4.1-GGML/resolve/main/airoboros-l2-13b-gpt4-1.4.1.ggmlv3.q4_0.bin\" #@param [\"\"]{allow-input: true}\n",
44
+ "Layers = 43 #@param [43]{allow-input: true}\n",
45
+ "\n",
46
+ "%cd /content\n",
47
+ "!git clone https://github.com/LostRuins/koboldcpp\n",
48
+ "%cd /content/koboldcpp\n",
49
+ "!make LLAMA_CUBLAS=1\n",
50
+ "\n",
51
+ "!wget $Model -O model.ggml\n",
52
+ "!wget -c https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64\n",
53
+ "!chmod +x cloudflared-linux-amd64\n",
54
+ "!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &\n",
55
+ "!sleep 10\n",
56
+ "!cat nohup.out\n",
57
+ "!python koboldcpp.py model.ggml --stream --usecublas 0 --gpulayers $Layers --hordeconfig concedo\n"
58
+ ]
59
+ }
60
+ ]
61
+ }
common/CMakeLists.txt ADDED
@@ -0,0 +1,20 @@
1
+ # common
2
+
3
+ set(TARGET common)
4
+
5
+ add_library(${TARGET} OBJECT
6
+ common.h
7
+ common.cpp
8
+ console.h
9
+ console.cpp
10
+ grammar-parser.h
11
+ grammar-parser.cpp
12
+ )
13
+
14
+ if (BUILD_SHARED_LIBS)
15
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
16
+ endif()
17
+
18
+ target_include_directories(${TARGET} PUBLIC .)
19
+ target_compile_features(${TARGET} PUBLIC cxx_std_11)
20
+ target_link_libraries(${TARGET} PRIVATE llama)
common/common.cpp ADDED
@@ -0,0 +1,1270 @@
1
+ #include "common.h"
2
+ #include "build-info.h"
3
+ #include "llama.h"
4
+
5
+ #include <algorithm>
6
+ #include <cassert>
7
+ #include <cmath>
8
+ #include <cstring>
9
+ #include <ctime>
10
+ #include <fstream>
11
+ #include <iterator>
12
+ #include <iostream>
13
+ #include <regex>
14
+ #include <sstream>
15
+ #include <string>
16
+ #include <unordered_set>
17
+ #include <vector>
18
+ #include <cinttypes>
19
+
20
+ #if defined(__APPLE__) && defined(__MACH__)
21
+ #include <sys/types.h>
22
+ #include <sys/sysctl.h>
23
+ #endif
24
+
25
+ #if defined(_WIN32)
26
+ #define WIN32_LEAN_AND_MEAN
27
+ #ifndef NOMINMAX
28
+ # define NOMINMAX
29
+ #endif
30
+ #include <codecvt>
31
+ #include <locale>
32
+ #include <windows.h>
33
+ #include <fcntl.h>
34
+ #include <io.h>
35
+ #else
36
+ #include <sys/ioctl.h>
37
+ #include <sys/stat.h>
38
+ #include <unistd.h>
39
+ #endif
40
+
41
+ #if defined(_MSC_VER)
42
+ #pragma warning(disable: 4244 4267) // possible loss of data
43
+ #endif
44
+
45
+ int32_t get_num_physical_cores() {
46
+ #ifdef __linux__
47
+ // enumerate the set of thread siblings, num entries is num cores
48
+ std::unordered_set<std::string> siblings;
49
+ for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
50
+ std::ifstream thread_siblings("/sys/devices/system/cpu"
51
+ + std::to_string(cpu) + "/topology/thread_siblings");
52
+ if (!thread_siblings.is_open()) {
53
+ break; // no more cpus
54
+ }
55
+ std::string line;
56
+ if (std::getline(thread_siblings, line)) {
57
+ siblings.insert(line);
58
+ }
59
+ }
60
+ if (!siblings.empty()) {
61
+ return static_cast<int32_t>(siblings.size());
62
+ }
63
+ #elif defined(__APPLE__) && defined(__MACH__)
64
+ int32_t num_physical_cores;
65
+ size_t len = sizeof(num_physical_cores);
66
+ int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
67
+ if (result == 0) {
68
+ return num_physical_cores;
69
+ }
70
+ result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
71
+ if (result == 0) {
72
+ return num_physical_cores;
73
+ }
74
+ #elif defined(_WIN32)
75
+ //TODO: Implement
76
+ #endif
77
+ unsigned int n_threads = std::thread::hardware_concurrency();
78
+ return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
79
+ }
80
+
81
+ static void process_escapes(std::string& input) {
82
+ std::size_t input_len = input.length();
83
+ std::size_t output_idx = 0;
84
+
85
+ for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
86
+ if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
87
+ switch (input[++input_idx]) {
88
+ case 'n': input[output_idx++] = '\n'; break;
89
+ case 'r': input[output_idx++] = '\r'; break;
90
+ case 't': input[output_idx++] = '\t'; break;
91
+ case '\'': input[output_idx++] = '\''; break;
92
+ case '\"': input[output_idx++] = '\"'; break;
93
+ case '\\': input[output_idx++] = '\\'; break;
94
+ default: input[output_idx++] = '\\';
95
+ input[output_idx++] = input[input_idx]; break;
96
+ }
97
+ } else {
98
+ input[output_idx++] = input[input_idx];
99
+ }
100
+ }
101
+
102
+ input.resize(output_idx);
103
+ }
104
+
105
+ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
106
+ bool invalid_param = false;
107
+ std::string arg;
108
+ gpt_params default_params;
109
+ const std::string arg_prefix = "--";
110
+
111
+ for (int i = 1; i < argc; i++) {
112
+ arg = argv[i];
113
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
114
+ std::replace(arg.begin(), arg.end(), '_', '-');
115
+ }
116
+
117
+ if (arg == "-s" || arg == "--seed") {
118
+ if (++i >= argc) {
119
+ invalid_param = true;
120
+ break;
121
+ }
122
+ params.seed = std::stoul(argv[i]);
123
+ } else if (arg == "-t" || arg == "--threads") {
124
+ if (++i >= argc) {
125
+ invalid_param = true;
126
+ break;
127
+ }
128
+ params.n_threads = std::stoi(argv[i]);
129
+ if (params.n_threads <= 0) {
130
+ params.n_threads = std::thread::hardware_concurrency();
131
+ }
132
+ } else if (arg == "-p" || arg == "--prompt") {
133
+ if (++i >= argc) {
134
+ invalid_param = true;
135
+ break;
136
+ }
137
+ params.prompt = argv[i];
138
+ } else if (arg == "-e" || arg == "--escape") {
139
+ params.escape = true;
140
+ } else if (arg == "--prompt-cache") {
141
+ if (++i >= argc) {
142
+ invalid_param = true;
143
+ break;
144
+ }
145
+ params.path_prompt_cache = argv[i];
146
+ } else if (arg == "--prompt-cache-all") {
147
+ params.prompt_cache_all = true;
148
+ } else if (arg == "--prompt-cache-ro") {
149
+ params.prompt_cache_ro = true;
150
+ } else if (arg == "-f" || arg == "--file") {
151
+ if (++i >= argc) {
152
+ invalid_param = true;
153
+ break;
154
+ }
155
+ std::ifstream file(argv[i]);
156
+ if (!file) {
157
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
158
+ invalid_param = true;
159
+ break;
160
+ }
161
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
162
+ if (params.prompt.back() == '\n') {
163
+ params.prompt.pop_back();
164
+ }
165
+ } else if (arg == "-n" || arg == "--n-predict") {
166
+ if (++i >= argc) {
167
+ invalid_param = true;
168
+ break;
169
+ }
170
+ params.n_predict = std::stoi(argv[i]);
171
+ } else if (arg == "--top-k") {
172
+ if (++i >= argc) {
173
+ invalid_param = true;
174
+ break;
175
+ }
176
+ params.top_k = std::stoi(argv[i]);
177
+ } else if (arg == "-c" || arg == "--ctx-size") {
178
+ if (++i >= argc) {
179
+ invalid_param = true;
180
+ break;
181
+ }
182
+ params.n_ctx = std::stoi(argv[i]);
183
+ } else if (arg == "--rope-freq-base") {
184
+ if (++i >= argc) {
185
+ invalid_param = true;
186
+ break;
187
+ }
188
+ params.rope_freq_base = std::stof(argv[i]);
189
+ } else if (arg == "--rope-freq-scale") {
190
+ if (++i >= argc) {
191
+ invalid_param = true;
192
+ break;
193
+ }
194
+ params.rope_freq_scale = std::stof(argv[i]);
195
+ } else if (arg == "--rope-scale") {
196
+ if (++i >= argc) {
197
+ invalid_param = true;
198
+ break;
199
+ }
200
+ params.rope_freq_scale = 1.0f/std::stof(argv[i]);
201
+ } else if (arg == "--memory-f32") {
202
+ params.memory_f16 = false;
203
+ } else if (arg == "--top-p") {
204
+ if (++i >= argc) {
205
+ invalid_param = true;
206
+ break;
207
+ }
208
+ params.top_p = std::stof(argv[i]);
209
+ } else if (arg == "--temp") {
210
+ if (++i >= argc) {
211
+ invalid_param = true;
212
+ break;
213
+ }
214
+ params.temp = std::stof(argv[i]);
215
+ } else if (arg == "--tfs") {
216
+ if (++i >= argc) {
217
+ invalid_param = true;
218
+ break;
219
+ }
220
+ params.tfs_z = std::stof(argv[i]);
221
+ } else if (arg == "--typical") {
222
+ if (++i >= argc) {
223
+ invalid_param = true;
224
+ break;
225
+ }
226
+ params.typical_p = std::stof(argv[i]);
227
+ } else if (arg == "--repeat-last-n") {
228
+ if (++i >= argc) {
229
+ invalid_param = true;
230
+ break;
231
+ }
232
+ params.repeat_last_n = std::stoi(argv[i]);
233
+ } else if (arg == "--repeat-penalty") {
234
+ if (++i >= argc) {
235
+ invalid_param = true;
236
+ break;
237
+ }
238
+ params.repeat_penalty = std::stof(argv[i]);
239
+ } else if (arg == "--frequency-penalty") {
240
+ if (++i >= argc) {
241
+ invalid_param = true;
242
+ break;
243
+ }
244
+ params.frequency_penalty = std::stof(argv[i]);
245
+ } else if (arg == "--presence-penalty") {
246
+ if (++i >= argc) {
247
+ invalid_param = true;
248
+ break;
249
+ }
250
+ params.presence_penalty = std::stof(argv[i]);
251
+ } else if (arg == "--mirostat") {
252
+ if (++i >= argc) {
253
+ invalid_param = true;
254
+ break;
255
+ }
256
+ params.mirostat = std::stoi(argv[i]);
257
+ } else if (arg == "--mirostat-lr") {
258
+ if (++i >= argc) {
259
+ invalid_param = true;
260
+ break;
261
+ }
262
+ params.mirostat_eta = std::stof(argv[i]);
263
+ } else if (arg == "--mirostat-ent") {
264
+ if (++i >= argc) {
265
+ invalid_param = true;
266
+ break;
267
+ }
268
+ params.mirostat_tau = std::stof(argv[i]);
269
+ } else if (arg == "--cfg-negative-prompt") {
270
+ if (++i >= argc) {
271
+ invalid_param = true;
272
+ break;
273
+ }
274
+ params.cfg_negative_prompt = argv[i];
275
+ } else if (arg == "--cfg-negative-prompt-file") {
276
+ if (++i >= argc) {
277
+ invalid_param = true;
278
+ break;
279
+ }
280
+ std::ifstream file(argv[i]);
281
+ if (!file) {
282
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
283
+ invalid_param = true;
284
+ break;
285
+ }
286
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
287
+ if (params.cfg_negative_prompt.back() == '\n') {
288
+ params.cfg_negative_prompt.pop_back();
289
+ }
290
+ } else if (arg == "--cfg-scale") {
291
+ if (++i >= argc) {
292
+ invalid_param = true;
293
+ break;
294
+ }
295
+ params.cfg_scale = std::stof(argv[i]);
296
+ } else if (arg == "-b" || arg == "--batch-size") {
297
+ if (++i >= argc) {
298
+ invalid_param = true;
299
+ break;
300
+ }
301
+ params.n_batch = std::stoi(argv[i]);
302
+ } else if (arg == "--keep") {
303
+ if (++i >= argc) {
304
+ invalid_param = true;
305
+ break;
306
+ }
307
+ params.n_keep = std::stoi(argv[i]);
308
+ } else if (arg == "--draft") {
309
+ if (++i >= argc) {
310
+ invalid_param = true;
311
+ break;
312
+ }
313
+ params.n_draft = std::stoi(argv[i]);
314
+ } else if (arg == "--chunks") {
315
+ if (++i >= argc) {
316
+ invalid_param = true;
317
+ break;
318
+ }
319
+ params.n_chunks = std::stoi(argv[i]);
320
+ } else if (arg == "-m" || arg == "--model") {
321
+ if (++i >= argc) {
322
+ invalid_param = true;
323
+ break;
324
+ }
325
+ params.model = argv[i];
326
+ } else if (arg == "-md" || arg == "--model-draft") {
327
+ if (++i >= argc) {
328
+ invalid_param = true;
329
+ break;
330
+ }
331
+ params.model_draft = argv[i];
332
+ } else if (arg == "-a" || arg == "--alias") {
333
+ if (++i >= argc) {
334
+ invalid_param = true;
335
+ break;
336
+ }
337
+ params.model_alias = argv[i];
338
+ } else if (arg == "--lora") {
339
+ if (++i >= argc) {
340
+ invalid_param = true;
341
+ break;
342
+ }
343
+ params.lora_adapter = argv[i];
344
+ params.use_mmap = false;
345
+ } else if (arg == "--lora-base") {
346
+ if (++i >= argc) {
347
+ invalid_param = true;
348
+ break;
349
+ }
350
+ params.lora_base = argv[i];
351
+ } else if (arg == "-i" || arg == "--interactive") {
352
+ params.interactive = true;
353
+ } else if (arg == "--embedding") {
354
+ params.embedding = true;
355
+ } else if (arg == "--interactive-first") {
356
+ params.interactive_first = true;
357
+ } else if (arg == "-ins" || arg == "--instruct") {
358
+ params.instruct = true;
359
+ } else if (arg == "--multiline-input") {
360
+ params.multiline_input = true;
361
+ } else if (arg == "--simple-io") {
362
+ params.simple_io = true;
363
+ } else if (arg == "--color") {
364
+ params.use_color = true;
365
+ } else if (arg == "--mlock") {
366
+ params.use_mlock = true;
367
+ } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
368
+ if (++i >= argc) {
369
+ invalid_param = true;
370
+ break;
371
+ }
372
+ #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
373
+ params.n_gpu_layers = std::stoi(argv[i]);
374
+ #else
375
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
376
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
377
+ #endif
378
+ } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
379
+ if (++i >= argc) {
380
+ invalid_param = true;
381
+ break;
382
+ }
383
+ #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
384
+ params.n_gpu_layers_draft = std::stoi(argv[i]);
385
+ #else
386
+ fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
387
+ fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
388
+ #endif
389
+ } else if (arg == "--main-gpu" || arg == "-mg") {
390
+ if (++i >= argc) {
391
+ invalid_param = true;
392
+ break;
393
+ }
394
+ #ifdef GGML_USE_CUBLAS
395
+ params.main_gpu = std::stoi(argv[i]);
396
+ #else
397
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
398
+ #endif
399
+ } else if (arg == "--tensor-split" || arg == "-ts") {
400
+ if (++i >= argc) {
401
+ invalid_param = true;
402
+ break;
403
+ }
404
+ #ifdef GGML_USE_CUBLAS
405
+ std::string arg_next = argv[i];
406
+
407
+ // split string by , and /
408
+ const std::regex regex{R"([,/]+)"};
409
+ std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
410
+ std::vector<std::string> split_arg{it, {}};
411
+ GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
412
+
413
+ for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
414
+ if (i < split_arg.size()) {
415
+ params.tensor_split[i] = std::stof(split_arg[i]);
416
+ } else {
417
+ params.tensor_split[i] = 0.0f;
418
+ }
419
+ }
420
+ #else
421
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
422
+ #endif // GGML_USE_CUBLAS
423
+ } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
424
+ #ifdef GGML_USE_CUBLAS
425
+ params.mul_mat_q = false;
426
+ #else
427
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
428
+ #endif // GGML_USE_CUBLAS
429
+ } else if (arg == "--low-vram" || arg == "-lv") {
430
+ #ifdef GGML_USE_CUBLAS
431
+ params.low_vram = true;
432
+ #else
433
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
434
+ #endif // GGML_USE_CUBLAS
435
+ } else if (arg == "--no-mmap") {
436
+ params.use_mmap = false;
437
+ } else if (arg == "--numa") {
438
+ params.numa = true;
439
+ } else if (arg == "--export") {
440
+ params.export_cgraph = true;
441
+ } else if (arg == "--verbose-prompt") {
442
+ params.verbose_prompt = true;
443
+ } else if (arg == "-r" || arg == "--reverse-prompt") {
444
+ if (++i >= argc) {
445
+ invalid_param = true;
446
+ break;
447
+ }
448
+ params.antiprompt.push_back(argv[i]);
449
+ } else if (arg == "-ld" || arg == "--logdir") {
450
+ if (++i >= argc) {
451
+ invalid_param = true;
452
+ break;
453
+ }
454
+ params.logdir = argv[i];
455
+
456
+ if (params.logdir.back() != DIRECTORY_SEPARATOR) {
457
+ params.logdir += DIRECTORY_SEPARATOR;
458
+ }
459
+ } else if (arg == "--perplexity") {
460
+ params.perplexity = true;
461
+ } else if (arg == "--ppl-stride") {
462
+ if (++i >= argc) {
463
+ invalid_param = true;
464
+ break;
465
+ }
466
+ params.ppl_stride = std::stoi(argv[i]);
467
+ } else if (arg == "--ppl-output-type") {
468
+ if (++i >= argc) {
469
+ invalid_param = true;
470
+ break;
471
+ }
472
+ params.ppl_output_type = std::stoi(argv[i]);
473
+ } else if (arg == "--hellaswag") {
474
+ params.hellaswag = true;
475
+ } else if (arg == "--hellaswag-tasks") {
476
+ if (++i >= argc) {
477
+ invalid_param = true;
478
+ break;
479
+ }
480
+ params.hellaswag_tasks = std::stoi(argv[i]);
481
+ } else if (arg == "--ignore-eos") {
482
+ params.ignore_eos = true;
483
+ } else if (arg == "--no-penalize-nl") {
484
+ params.penalize_nl = false;
485
+ } else if (arg == "-l" || arg == "--logit-bias") {
486
+ if (++i >= argc) {
487
+ invalid_param = true;
488
+ break;
489
+ }
490
+ std::stringstream ss(argv[i]);
491
+ llama_token key;
492
+ char sign;
493
+ std::string value_str;
494
+ try {
495
+ if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
496
+ params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
497
+ } else {
498
+ throw std::exception();
499
+ }
500
+ } catch (const std::exception&) {
501
+ invalid_param = true;
502
+ break;
503
+ }
504
+ } else if (arg == "-h" || arg == "--help") {
505
+ gpt_print_usage(argc, argv, default_params);
506
+ #ifndef LOG_DISABLE_LOGS
507
+ log_print_usage();
508
+ #endif // LOG_DISABLE_LOGS
509
+ exit(0);
510
+ } else if (arg == "--random-prompt") {
511
+ params.random_prompt = true;
512
+ } else if (arg == "--in-prefix-bos") {
513
+ params.input_prefix_bos = true;
514
+ } else if (arg == "--in-prefix") {
515
+ if (++i >= argc) {
516
+ invalid_param = true;
517
+ break;
518
+ }
519
+ params.input_prefix = argv[i];
520
+ } else if (arg == "--in-suffix") {
521
+ if (++i >= argc) {
522
+ invalid_param = true;
523
+ break;
524
+ }
525
+ params.input_suffix = argv[i];
526
+ } else if (arg == "--grammar") {
527
+ if (++i >= argc) {
528
+ invalid_param = true;
529
+ break;
530
+ }
531
+ params.grammar = argv[i];
532
+ } else if (arg == "--grammar-file") {
533
+ if (++i >= argc) {
534
+ invalid_param = true;
535
+ break;
536
+ }
537
+ std::ifstream file(argv[i]);
538
+ if (!file) {
539
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
540
+ invalid_param = true;
541
+ break;
542
+ }
543
+ std::copy(
544
+ std::istreambuf_iterator<char>(file),
545
+ std::istreambuf_iterator<char>(),
546
+ std::back_inserter(params.grammar)
547
+ );
548
+ #ifndef LOG_DISABLE_LOGS
549
+ // Parse args for logging parameters
550
+ } else if ( log_param_single_parse( argv[i] ) ) {
551
+ // Do nothing, log_param_single_parse automatically does its thing
552
+ // and returns if a match was found and parsed.
553
+ } else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) {
554
+ // We have a matching known parameter requiring an argument,
555
+ // now we need to check if there is anything after this argv
556
+ // and flag invalid_param or parse it.
557
+ if (++i >= argc) {
558
+ invalid_param = true;
559
+ break;
560
+ }
561
+ if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) {
562
+ invalid_param = true;
563
+ break;
564
+ }
565
+ // End of Parse args for logging parameters
566
+ #endif // LOG_DISABLE_LOGS
567
+ } else {
568
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
569
+ gpt_print_usage(argc, argv, default_params);
570
+ exit(1);
571
+ }
572
+ }
573
+ if (invalid_param) {
574
+ fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
575
+ gpt_print_usage(argc, argv, default_params);
576
+ exit(1);
577
+ }
578
+ if (params.prompt_cache_all &&
579
+ (params.interactive || params.interactive_first ||
580
+ params.instruct)) {
581
+ fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
582
+ gpt_print_usage(argc, argv, default_params);
583
+ exit(1);
584
+ }
585
+
586
+ if (params.escape) {
587
+ process_escapes(params.prompt);
588
+ process_escapes(params.input_prefix);
589
+ process_escapes(params.input_suffix);
590
+ }
591
+
592
+ return true;
593
+ }
594
+
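As an aside on the `-l/--logit-bias` branch above: the argument packs a token id, a mandatory sign, and a bias value into a single string. A minimal standalone sketch of the same parsing idea (the `parse_logit_bias` helper is hypothetical, not part of this commit):

    #include <sstream>
    #include <string>
    #include <unordered_map>

    // Parses "15043+1" or "15043-1.5" into (token id, signed bias), mirroring the
    // stringstream logic used by gpt_params_parse above.
    static bool parse_logit_bias(const std::string & arg, std::unordered_map<int, float> & bias) {
        std::stringstream ss(arg);
        int key;
        char sign;
        std::string value_str;
        try {
            if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
                bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
                return true; // e.g. "15043+1" -> bias[15043] = +1.0f
            }
        } catch (const std::exception &) {
            // fall through: malformed or missing bias value
        }
        return false;
    }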
595
+ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
596
+ printf("usage: %s [options]\n", argv[0]);
597
+ printf("\n");
598
+ printf("options:\n");
599
+ printf(" -h, --help show this help message and exit\n");
600
+ printf(" -i, --interactive run in interactive mode\n");
601
+ printf(" --interactive-first run in interactive mode and wait for input right away\n");
602
+ printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
603
+ printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
604
+ printf(" -r PROMPT, --reverse-prompt PROMPT\n");
605
+ printf(" halt generation at PROMPT, return control in interactive mode\n");
606
+ printf(" (can be specified more than once for multiple prompts).\n");
607
+ printf(" --color colorise output to distinguish prompt and user input from generations\n");
608
+ printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
609
+ printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
610
+ printf(" -p PROMPT, --prompt PROMPT\n");
611
+ printf(" prompt to start generation with (default: empty)\n");
612
+ printf("  -e, --escape          process prompt escape sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
613
+ printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
614
+ printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
615
+ printf(" not supported with --interactive or other interactive options\n");
616
+ printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
617
+ printf(" --random-prompt start with a randomized prompt.\n");
618
+ printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
619
+ printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n");
620
+ printf("  --in-suffix STRING    string to suffix user inputs with (default: empty)\n");
621
+ printf(" -f FNAME, --file FNAME\n");
622
+ printf(" prompt file to start generation.\n");
623
+ printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
624
+ printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
625
+ printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
626
+ printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
627
+ printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
628
+ printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
629
+ printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
630
+ printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
631
+ printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
632
+ printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
633
+ printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
634
+ printf(" --mirostat N use Mirostat sampling.\n");
635
+ printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
636
+ printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
637
+ printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
638
+ printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
639
+ printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
640
+ printf(" modifies the likelihood of token appearing in the completion,\n");
641
+ printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
642
+ printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
643
+ printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
644
+ printf(" --grammar-file FNAME file to read grammar from\n");
645
+ printf(" --cfg-negative-prompt PROMPT\n");
646
+ printf(" negative prompt to use for guidance. (default: empty)\n");
647
+ printf(" --cfg-negative-prompt-file FNAME\n");
648
+ printf(" negative prompt file to use for guidance. (default: empty)\n");
649
+ printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
650
+ printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
651
+ printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
652
+ printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
653
+ printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
654
+ printf(" --no-penalize-nl do not penalize newline token\n");
655
+ printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
656
+ printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
657
+ printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
658
+ printf(" --perplexity compute perplexity over each ctx window of the prompt\n");
659
+ printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
660
+ printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
661
+ printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
662
+ printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
663
+ printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
664
+ if (llama_mlock_supported()) {
665
+ printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
666
+ }
667
+ if (llama_mmap_supported()) {
668
+ printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
669
+ }
670
+ printf(" --numa attempt optimizations that help on some NUMA systems\n");
671
+ printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
672
+ printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
673
+ #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
674
+ printf(" -ngl N, --n-gpu-layers N\n");
675
+ printf(" number of layers to store in VRAM\n");
676
+ printf(" -ngld N, --n-gpu-layers-draft N\n");
677
+ printf(" number of layers to store in VRAM for the draft model\n");
678
+ printf(" -ts SPLIT --tensor-split SPLIT\n");
679
+ printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
680
+ printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
681
+ printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
682
+ #ifdef GGML_USE_CUBLAS
683
+ printf(" -nommq, --no-mul-mat-q\n");
684
+ printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
685
+ printf(" Not recommended since this is both slower and uses more VRAM.\n");
686
+ #endif // GGML_USE_CUBLAS
687
+ #endif
688
+ printf(" --export export the computation graph to 'llama.ggml'\n");
689
+ printf(" --verbose-prompt print prompt before generation\n");
690
+ printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
691
+ printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
692
+ printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
693
+ printf(" -m FNAME, --model FNAME\n");
694
+ printf(" model path (default: %s)\n", params.model.c_str());
695
+ printf(" -md FNAME, --model-draft FNAME\n");
696
+ printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
697
+ printf(" -ld LOGDIR, --logdir LOGDIR\n");
698
+ printf(" path under which to save YAML logs (no logging if unset)\n");
699
+ printf("\n");
700
+ }
701
+
702
+ std::string gpt_random_prompt(std::mt19937 & rng) {
703
+ const int r = rng() % 10;
704
+ switch (r) {
705
+ case 0: return "So";
706
+ case 1: return "Once upon a time";
707
+ case 2: return "When";
708
+ case 3: return "The";
709
+ case 4: return "After";
710
+ case 5: return "If";
711
+ case 6: return "import";
712
+ case 7: return "He";
713
+ case 8: return "She";
714
+ case 9: return "They";
715
+ default: return "To";
716
+ }
717
+
718
+ return "The"; // unreachable: every switch case above returns; kept to silence compiler warnings
719
+ }
720
+
721
+ //
722
+ // Model utils
723
+ //
724
+
725
+ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
726
+ auto lparams = llama_context_default_params();
727
+
728
+ lparams.n_ctx = params.n_ctx;
729
+ lparams.n_batch = params.n_batch;
730
+ if (params.n_gpu_layers != -1) {
731
+ lparams.n_gpu_layers = params.n_gpu_layers;
732
+ }
733
+ lparams.main_gpu = params.main_gpu;
734
+ lparams.tensor_split = params.tensor_split;
735
+ lparams.low_vram = params.low_vram;
736
+ lparams.mul_mat_q = params.mul_mat_q;
737
+ lparams.seed = params.seed;
738
+ lparams.f16_kv = params.memory_f16;
739
+ lparams.use_mmap = params.use_mmap;
740
+ lparams.use_mlock = params.use_mlock;
741
+ lparams.logits_all = params.perplexity;
742
+ lparams.embedding = params.embedding;
743
+ lparams.rope_freq_base = params.rope_freq_base;
744
+ lparams.rope_freq_scale = params.rope_freq_scale;
745
+
746
+ return lparams;
747
+ }
748
+
749
+ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
750
+ auto lparams = llama_context_params_from_gpt_params(params);
751
+
752
+ llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
753
+ if (model == NULL) {
754
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
755
+ return std::make_tuple(nullptr, nullptr);
756
+ }
757
+
758
+ llama_context * lctx = llama_new_context_with_model(model, lparams);
759
+ if (lctx == NULL) {
760
+ fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
761
+ llama_free_model(model);
762
+ return std::make_tuple(nullptr, nullptr);
763
+ }
764
+
765
+ if (!params.lora_adapter.empty()) {
766
+ int err = llama_model_apply_lora_from_file(model,
767
+ params.lora_adapter.c_str(),
768
+ params.lora_base.empty() ? NULL : params.lora_base.c_str(),
769
+ params.n_threads);
770
+ if (err != 0) {
771
+ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
772
+ llama_free(lctx);
773
+ llama_free_model(model);
774
+ return std::make_tuple(nullptr, nullptr);
775
+ }
776
+ }
777
+
778
+ if (params.ignore_eos) {
779
+ params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
780
+ }
781
+
782
+ {
783
+ LOG("warming up the model with an empty run\n");
784
+
785
+ const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
786
+ llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
787
+ llama_reset_timings(lctx);
788
+ }
789
+
790
+ return std::make_tuple(model, lctx);
791
+ }
792
+
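A hedged usage sketch for the initialization helper above, reduced to the essentials (real examples add prompt handling and error reporting):

    #include "common.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }

        llama_backend_init(params.numa);

        llama_model * model;
        llama_context * ctx;
        std::tie(model, ctx) = llama_init_from_gpt_params(params);
        if (model == NULL || ctx == NULL) {
            return 1;
        }

        // ... tokenize, evaluate and sample here ...

        llama_free(ctx);          // free the context before the model
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }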
793
+ //
794
+ // Vocab utils
795
+ //
796
+
797
+ std::vector<llama_token> llama_tokenize(
798
+ struct llama_context * ctx,
799
+ const std::string & text,
800
+ bool add_bos) {
801
+ // upper limit for the number of tokens
802
+ int n_tokens = text.length() + add_bos;
803
+ std::vector<llama_token> result(n_tokens);
804
+ n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
805
+ if (n_tokens < 0) {
806
+ result.resize(-n_tokens);
807
+ int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
808
+ GGML_ASSERT(check == -n_tokens);
809
+ } else {
810
+ result.resize(n_tokens);
811
+ }
812
+ return result;
813
+ }
814
+
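The wrapper above relies on the C API's size-negotiation convention: a negative return value is the required buffer size, so the call is retried once with a resized vector. Typical use is then a one-liner (assuming a valid `ctx`):

    // Tokenize a prompt with a BOS token prepended and inspect the result.
    const std::vector<llama_token> tokens = llama_tokenize(ctx, "Hello world", /*add_bos=*/true);
    for (const llama_token t : tokens) {
        fprintf(stdout, "%d -> '%s'\n", t, llama_token_to_piece(ctx, t).c_str());
    }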
815
+ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
816
+ std::vector<char> result(8, 0);
817
+ const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
818
+ if (n_tokens < 0) {
819
+ result.resize(-n_tokens);
820
+ int check = llama_token_to_piece(ctx, token, result.data(), result.size());
821
+ GGML_ASSERT(check == -n_tokens);
822
+ } else {
823
+ result.resize(n_tokens);
824
+ }
825
+
826
+ return std::string(result.data(), result.size());
827
+ }
828
+
829
+ std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
830
+ const llama_token bos_id = llama_token_bos(ctx);
831
+
832
+ std::string piece;
833
+ std::string result;
834
+
835
+ for (size_t i = 0; i < tokens.size(); ++i) {
836
+ piece = llama_token_to_piece(ctx, tokens[i]);
837
+
838
+ // remove the leading space of the first non-BOS token
839
+ if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
840
+ piece = piece.substr(1);
841
+ }
842
+
843
+ result += piece;
844
+ }
845
+
846
+ return result;
847
+ }
848
+
849
+ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
850
+ std::string piece;
851
+ std::string result;
852
+
853
+ for (size_t i = 0; i < tokens.size(); ++i) {
854
+ piece = llama_token_to_piece(ctx, tokens[i]);
855
+
856
+ result += piece;
857
+ }
858
+
859
+ return result;
860
+ }
861
+
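Which detokenizer applies depends on the tokenizer type of the loaded model: `llama_detokenize_spm` for SentencePiece (LLaMA-style) vocabularies, `llama_detokenize_bpe` for BPE ones. A rough round-trip sketch, assuming an SPM-based model and a valid `ctx`:

    // text -> tokens -> text; for SPM vocabularies the leading space added by the
    // tokenizer is stripped from the first piece, so the output should match the input.
    const std::string input  = "The quick brown fox";
    const std::vector<llama_token> toks = llama_tokenize(ctx, input, /*add_bos=*/true);
    const std::string output = llama_detokenize_spm(ctx, toks);
    // expected: output == input for typical SPM tokenizers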
862
+ //
863
+ // Sampling utils
864
+ //
865
+
866
+ llama_token llama_sample_token(
867
+ struct llama_context * ctx,
868
+ struct llama_context * ctx_guidance,
869
+ struct llama_grammar * grammar,
870
+ const struct gpt_params & params,
871
+ const std::vector<llama_token> & last_tokens,
872
+ std::vector<llama_token_data> & candidates,
873
+ int idx) {
874
+ const int n_ctx = llama_n_ctx(ctx);
875
+ const int n_vocab = llama_n_vocab(ctx);
876
+
877
+ const float temp = params.temp;
878
+ const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
879
+ const float top_p = params.top_p;
880
+ const float tfs_z = params.tfs_z;
881
+ const float typical_p = params.typical_p;
882
+ const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
883
+ const float repeat_penalty = params.repeat_penalty;
884
+ const float alpha_presence = params.presence_penalty;
885
+ const float alpha_frequency = params.frequency_penalty;
886
+ const int mirostat = params.mirostat;
887
+ const float mirostat_tau = params.mirostat_tau;
888
+ const float mirostat_eta = params.mirostat_eta;
889
+ const bool penalize_nl = params.penalize_nl;
890
+
891
+ llama_token id = 0;
892
+
893
+ float * logits = llama_get_logits(ctx) + idx * n_vocab;
894
+
895
+ // Apply params.logit_bias map
896
+ for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
897
+ logits[it->first] += it->second;
898
+ }
899
+
900
+ candidates.clear();
901
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
902
+ candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
903
+ }
904
+
905
+ llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
906
+
907
+ if (ctx_guidance) {
908
+ llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
909
+ }
910
+
911
+ // apply penalties
912
+ if (!last_tokens.empty()) {
913
+ const float nl_logit = logits[llama_token_nl(ctx)];
914
+ const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
915
+
916
+ llama_sample_repetition_penalty(ctx, &cur_p,
917
+ last_tokens.data() + last_tokens.size() - last_n_repeat,
918
+ last_n_repeat, repeat_penalty);
919
+ llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
920
+ last_tokens.data() + last_tokens.size() - last_n_repeat,
921
+ last_n_repeat, alpha_frequency, alpha_presence);
922
+
923
+ if (!penalize_nl) {
924
+ for (size_t idx = 0; idx < cur_p.size; idx++) {
925
+ if (cur_p.data[idx].id == llama_token_nl(ctx)) {
926
+ cur_p.data[idx].logit = nl_logit;
927
+ break;
928
+ }
929
+ }
930
+ }
931
+ }
932
+
933
+ if (grammar != NULL) {
934
+ llama_sample_grammar(ctx, &cur_p, grammar);
935
+ }
936
+
937
+ if (temp <= 0) {
938
+ // Greedy sampling
939
+ id = llama_sample_token_greedy(ctx, &cur_p);
940
+ } else {
941
+ if (mirostat == 1) {
942
+ static float mirostat_mu = 2.0f * mirostat_tau;
943
+ const int mirostat_m = 100;
944
+ llama_sample_temperature(ctx, &cur_p, temp);
945
+ id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
946
+ } else if (mirostat == 2) {
947
+ static float mirostat_mu = 2.0f * mirostat_tau;
948
+ llama_sample_temperature(ctx, &cur_p, temp);
949
+ id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
950
+ } else {
951
+ // Temperature sampling
952
+ llama_sample_top_k (ctx, &cur_p, top_k, 1);
953
+ llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
954
+ llama_sample_typical (ctx, &cur_p, typical_p, 1);
955
+ llama_sample_top_p (ctx, &cur_p, top_p, 1);
956
+ llama_sample_temperature(ctx, &cur_p, temp);
957
+
958
+ {
959
+ const int n_top = 10;
960
+ LOG("top %d candidates:\n", n_top);
961
+
962
+ for (int i = 0; i < n_top; i++) {
963
+ const llama_token id = cur_p.data[i].id;
964
+ LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
965
+ }
966
+ }
967
+
968
+ id = llama_sample_token(ctx, &cur_p);
969
+
970
+ LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
971
+ }
972
+ }
973
+ // printf("`%d`", candidates_p.size);
974
+
975
+ if (grammar != NULL) {
976
+ llama_grammar_accept_token(ctx, grammar, id);
977
+ }
978
+
979
+ return id;
980
+ }
981
+
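A condensed sketch of how the examples drive this sampler in a generation loop; `ctx`, `params` and an already-tokenized prompt `tokens_list` are assumed, and guidance and grammar are left disabled:

    std::vector<llama_token> last_tokens;          // history for the repetition penalties
    std::vector<llama_token_data> candidates;      // scratch buffer reused across calls
    candidates.reserve(llama_n_vocab(ctx));

    // evaluate the prompt, then sample and evaluate one token at a time
    llama_eval(ctx, tokens_list.data(), (int) tokens_list.size(), 0, params.n_threads);
    int n_past = (int) tokens_list.size();

    for (int i = 0; i < params.n_predict; ++i) {
        const llama_token id = llama_sample_token(ctx, /*ctx_guidance=*/NULL, /*grammar=*/NULL,
                                                  params, last_tokens, candidates);
        if (id == llama_token_eos(ctx)) {
            break;
        }
        last_tokens.push_back(id);
        printf("%s", llama_token_to_piece(ctx, id).c_str());

        llama_eval(ctx, &id, 1, n_past, params.n_threads);
        n_past += 1;
    }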
982
+ //
983
+ // YAML utils
984
+ //
985
+
986
+ // returns true if successful, false otherwise
987
+ bool create_directory_with_parents(const std::string & path) {
988
+ #ifdef _WIN32
989
+ std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
990
+ std::wstring wpath = converter.from_bytes(path);
991
+
992
+ // if the path already exists, check whether it's a directory
993
+ const DWORD attributes = GetFileAttributesW(wpath.c_str());
994
+ if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
995
+ return true;
996
+ }
997
+
998
+ size_t pos_slash = 0;
999
+
1000
+ // process path from front to back, procedurally creating directories
1001
+ while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
1002
+ const std::wstring subpath = wpath.substr(0, pos_slash);
1003
+ const wchar_t * test = subpath.c_str();
1004
+
1005
+ const bool success = CreateDirectoryW(test, NULL);
1006
+ if (!success) {
1007
+ const DWORD error = GetLastError();
1008
+
1009
+ // if the path already exists, ensure that it's a directory
1010
+ if (error == ERROR_ALREADY_EXISTS) {
1011
+ const DWORD attributes = GetFileAttributesW(subpath.c_str());
1012
+ if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
1013
+ return false;
1014
+ }
1015
+ } else {
1016
+ return false;
1017
+ }
1018
+ }
1019
+
1020
+ pos_slash += 1;
1021
+ }
1022
+
1023
+ return true;
1024
+ #else
1025
+ // if the path already exists, check whether it's a directory
1026
+ struct stat info;
1027
+ if (stat(path.c_str(), &info) == 0) {
1028
+ return S_ISDIR(info.st_mode);
1029
+ }
1030
+
1031
+ size_t pos_slash = 1; // skip leading slashes for directory creation
1032
+
1033
+ // process path from front to back, procedurally creating directories
1034
+ while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
1035
+ const std::string subpath = path.substr(0, pos_slash);
1036
+ struct stat info;
1037
+
1038
+ // if the path already exists, ensure that it's a directory
1039
+ if (stat(subpath.c_str(), &info) == 0) {
1040
+ if (!S_ISDIR(info.st_mode)) {
1041
+ return false;
1042
+ }
1043
+ } else {
1044
+ // create parent directories
1045
+ const int ret = mkdir(subpath.c_str(), 0755);
1046
+ if (ret != 0) {
1047
+ return false;
1048
+ }
1049
+ }
1050
+
1051
+ pos_slash += 1;
1052
+ }
1053
+
1054
+ return true;
1055
+ #endif // _WIN32
1056
+ }
1057
+
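A short sketch of how the `--logdir` pieces fit together; `ctx` and `prompt_tokens` are assumed to exist, and the file-name suffix is illustrative (each example chooses its own):

    if (!params.logdir.empty()) {
        if (!create_directory_with_parents(params.logdir)) {
            fprintf(stderr, "warning: failed to create logdir %s\n", params.logdir.c_str());
        } else {
            const std::string timestamp = get_sortable_timestamp();      // defined below
            const std::string path      = params.logdir + timestamp + "-main.yml";
            FILE * logfile = fopen(path.c_str(), "w");
            if (logfile != NULL) {
                dump_non_result_info_yaml(logfile, params, ctx, timestamp, prompt_tokens, "model description");
                fclose(logfile);
            }
        }
    }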
1058
+ void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
1059
+ if (data.empty()) {
1060
+ fprintf(stream, "%s:\n", prop_name);
1061
+ return;
1062
+ }
1063
+
1064
+ fprintf(stream, "%s: [", prop_name);
1065
+ for (size_t i = 0; i < data.size() - 1; ++i) {
1066
+ fprintf(stream, "%e, ", data[i]);
1067
+ }
1068
+ fprintf(stream, "%e]\n", data.back());
1069
+ }
1070
+
1071
+ void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
1072
+ if (data.empty()) {
1073
+ fprintf(stream, "%s:\n", prop_name);
1074
+ return;
1075
+ }
1076
+
1077
+ fprintf(stream, "%s: [", prop_name);
1078
+ for (size_t i = 0; i < data.size() - 1; ++i) {
1079
+ fprintf(stream, "%d, ", data[i]);
1080
+ }
1081
+ fprintf(stream, "%d]\n", data.back());
1082
+ }
1083
+
1084
+ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
1085
+ std::string data_str(data == NULL ? "" : data);
1086
+
1087
+ if (data_str.empty()) {
1088
+ fprintf(stream, "%s:\n", prop_name);
1089
+ return;
1090
+ }
1091
+
1092
+ size_t pos_start = 0;
1093
+ size_t pos_found = 0;
1094
+
1095
+ if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
1096
+ data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
1097
+ data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
1098
+ data_str = "\"" + data_str + "\"";
1099
+ fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
1100
+ return;
1101
+ }
1102
+
1103
+ if (data_str.find('\n') == std::string::npos) {
1104
+ fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
1105
+ return;
1106
+ }
1107
+
1108
+ fprintf(stream, "%s: |\n", prop_name);
1109
+ while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
1110
+ fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
1111
+ pos_start = pos_found + 1;
1112
+ }
1113
+ }
1114
+
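For orientation, the branches above produce output roughly like the following (illustrative values, shown as comments):

    // dump_string_yaml_multiline(f, "prompt", "Hello world");    ->  prompt: Hello world
    // dump_string_yaml_multiline(f, "prompt", " padded ");       ->  prompt: " padded "        (quoted, with \n and \" escaped)
    // dump_string_yaml_multiline(f, "prompt", "Hi\nthere\n");    ->  prompt: "Hi\nthere\n"     (a trailing '\n' counts as whitespace)
    // dump_string_yaml_multiline(f, "prompt", "Hi\nthere");      ->  prompt: |
    //                                                                  Hi
    // note: the block-literal branch only emits segments that are terminated by '\n'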
1115
+ std::string get_sortable_timestamp() {
1116
+ using clock = std::chrono::system_clock;
1117
+
1118
+ const clock::time_point current_time = clock::now();
1119
+ const time_t as_time_t = clock::to_time_t(current_time);
1120
+ char timestamp_no_ns[100];
1121
+ std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
1122
+
1123
+ const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
1124
+ current_time.time_since_epoch() % 1000000000).count();
1125
+ char timestamp_ns[11];
1126
+ snprintf(timestamp_ns, 11, "%09" PRId64, ns);
1127
+
1128
+ return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
1129
+ }
1130
+
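The format above is chosen so that plain alphabetical sorting of log file names matches chronological order; an illustrative return value:

    const std::string ts = get_sortable_timestamp();
    // e.g. "2023_09_03-16_45_07.123456789"  (local date and time, then nanoseconds)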
1131
+ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
1132
+ const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
1133
+ fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
1134
+ fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
1135
+ fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
1136
+ fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
1137
+ fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
1138
+ fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
1139
+ fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
1140
+ fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
1141
+ fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
1142
+ fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
1143
+ fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
1144
+ fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
1145
+ fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
1146
+ fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
1147
+ fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
1148
+ fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
1149
+ fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
1150
+ fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
1151
+ fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
1152
+ fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
1153
+
1154
+ #ifdef NDEBUG
1155
+ fprintf(stream, "debug: false\n");
1156
+ #else
1157
+ fprintf(stream, "debug: true\n");
1158
+ #endif // NDEBUG
1159
+
1160
+ fprintf(stream, "model_desc: %s\n", model_desc);
1161
+ fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx));
1162
+
1163
+ #ifdef __OPTIMIZE__
1164
+ fprintf(stream, "optimize: true\n");
1165
+ #else
1166
+ fprintf(stream, "optimize: false\n");
1167
+ #endif // __OPTIMIZE__
1168
+
1169
+ fprintf(stream, "time: %s\n", timestamp.c_str());
1170
+
1171
+ fprintf(stream, "\n");
1172
+ fprintf(stream, "###############\n");
1173
+ fprintf(stream, "# User Inputs #\n");
1174
+ fprintf(stream, "###############\n");
1175
+ fprintf(stream, "\n");
1176
+
1177
+ fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
1178
+ fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
1179
+ dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
1180
+ fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
1181
+ fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
1182
+ fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
1183
+ fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
1184
+ fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
1185
+ fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false");
1186
+ fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
1187
+ fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
1188
+ dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
1189
+ fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
1190
+ fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
1191
+ fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
1192
+
1193
+ const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
1194
+ const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
1195
+ fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
1196
+
1197
+ dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
1198
+ fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
1199
+ dump_string_yaml_multiline(stream, "in_suffix", params.input_suffix.c_str());
1200
+ fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
1201
+ fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
1202
+ fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
1203
+ fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
1204
+ fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
1205
+
1206
+ fprintf(stream, "logit_bias:\n");
1207
+ for (std::pair<llama_token, float> lb : params.logit_bias) {
1208
+ if (ignore_eos && lb.first == logit_bias_eos->first) {
1209
+ continue;
1210
+ }
1211
+ fprintf(stream, "  %d: %f\n", lb.first, lb.second);
1212
+ }
1213
+
1214
+ fprintf(stream, "lora: %s\n", params.lora_adapter.c_str());
1215
+ fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
1216
+ fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
1217
+ fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
1218
+ fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
1219
+ fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
1220
+ fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
1221
+ fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
1222
+ fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
1223
+ fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
1224
+ fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
1225
+ fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
1226
+ fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
1227
+ fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
1228
+ fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
1229
+ fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
1230
+ fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
1231
+ fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
1232
+ fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
1233
+ fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
1234
+ fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
1235
+ fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
1236
+ dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
1237
+ fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
1238
+ fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
1239
+ fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
1240
+ dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
1241
+ fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
1242
+ fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
1243
+
1244
+ fprintf(stream, "reverse_prompt:\n");
1245
+ for (std::string ap : params.antiprompt) {
1246
+ size_t pos = 0;
1247
+ while ((pos = ap.find('\n', pos)) != std::string::npos) {
1248
+ ap.replace(pos, 1, "\\n");
1249
+ pos += 1;
1250
+ }
1251
+
1252
+ fprintf(stream, " - %s\n", ap.c_str());
1253
+ }
1254
+
1255
+ fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
1256
+ fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
1257
+ fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
1258
+ fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
1259
+ fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
1260
+
1261
+ const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
1262
+ dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
1263
+
1264
+ fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
1265
+ fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
1266
+ fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
1267
+ fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
1268
+ fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
1269
+ fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
1270
+ }
common/common.h ADDED
@@ -0,0 +1,211 @@
1
+ // Various helper functions and utilities
2
+
3
+ #pragma once
4
+
5
+ #include "llama.h"
6
+
7
+ #define LOG_NO_FILE_LINE_FUNCTION
8
+ #include "log.h"
9
+
10
+ #include <string>
11
+ #include <vector>
12
+ #include <random>
13
+ #include <thread>
14
+ #include <unordered_map>
15
+ #include <tuple>
16
+
17
+ #ifdef _WIN32
18
+ #define DIRECTORY_SEPARATOR '\\'
19
+ #else
20
+ #define DIRECTORY_SEPARATOR '/'
21
+ #endif // _WIN32
22
+
23
+ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
24
+ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
25
+
26
+ #define print_build_info() do { \
27
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
28
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
29
+ } while(0)
30
+
31
+ //
32
+ // CLI argument parsing
33
+ //
34
+ int32_t get_num_physical_cores();
35
+
36
+ struct gpt_params {
37
+ uint32_t seed = -1; // RNG seed
38
+ int32_t n_threads = get_num_physical_cores();
39
+ int32_t n_predict = -1; // new tokens to predict
40
+ int32_t n_ctx = 512; // context size
41
+ int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
42
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
43
+ int32_t n_draft = 16; // number of tokens to draft during speculative decoding
44
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
45
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
46
+ int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
47
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
48
+ float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
49
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
50
+ int32_t n_beams = 0; // if non-zero then use beam search of given width.
51
+ float rope_freq_base = 10000.0f; // RoPE base frequency
52
+ float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
53
+
54
+ // sampling parameters
55
+ int32_t top_k = 40; // <= 0 to use vocab size
56
+ float top_p = 0.95f; // 1.0 = disabled
57
+ float tfs_z = 1.00f; // 1.0 = disabled
58
+ float typical_p = 1.00f; // 1.0 = disabled
59
+ float temp = 0.80f; // 1.0 = disabled
60
+ float repeat_penalty = 1.10f; // 1.0 = disabled
61
+ int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
62
+ float frequency_penalty = 0.00f; // 0.0 = disabled
63
+ float presence_penalty = 0.00f; // 0.0 = disabled
64
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
65
+ float mirostat_tau = 5.00f; // target entropy
66
+ float mirostat_eta = 0.10f; // learning rate
67
+
68
+ std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
69
+
70
+ // Classifier-Free Guidance
71
+ // https://arxiv.org/abs/2306.17806
72
+ std::string cfg_negative_prompt; // string to help guidance
73
+ float cfg_scale = 1.f; // How strong is guidance
74
+
75
+ std::string model = "models/7B/ggml-model-f16.gguf"; // model path
76
+ std::string model_draft = ""; // draft model for speculative decoding
77
+ std::string model_alias = "unknown"; // model alias
78
+ std::string prompt = "";
79
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
80
+ std::string input_prefix = ""; // string to prefix user inputs with
81
+ std::string input_suffix = ""; // string to suffix user inputs with
82
+ std::string grammar = ""; // optional BNF-like grammar to constrain sampling
83
+ std::vector<std::string> antiprompt; // strings that, when seen in generated output, halt generation and return control to the user
84
+ std::string logdir = ""; // directory in which to save YAML log files
85
+
86
+ std::string lora_adapter = ""; // lora adapter path
87
+ std::string lora_base = ""; // base model path for the lora adapter
88
+
89
+ int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
90
+ int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
91
+ // (which is more convenient to use for plotting)
92
+ //
93
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
94
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
95
+
96
+ bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
97
+ bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
98
+ bool memory_f16 = true; // use f16 instead of f32 for memory kv
99
+ bool random_prompt = false; // do not randomize prompt if none provided
100
+ bool use_color = false; // use color to distinguish generations and inputs
101
+ bool interactive = false; // interactive mode
102
+ bool prompt_cache_all = false; // save user input and generations to prompt cache
103
+ bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
104
+
105
+ bool embedding = false; // get only sentence embedding
106
+ bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
107
+ bool interactive_first = false; // wait for user input immediately
108
+ bool multiline_input = false; // reverse the usage of `\`
109
+ bool simple_io = false; // improves compatibility with subprocesses and limited consoles
110
+
111
+ bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
112
+ bool ignore_eos = false; // ignore generated EOS tokens
113
+ bool instruct = false; // instruction mode (used for Alpaca models)
114
+ bool penalize_nl = true; // consider newlines as a repeatable token
115
+ bool perplexity = false; // compute perplexity over the prompt
116
+ bool use_mmap = true; // use mmap for faster loads
117
+ bool use_mlock = false; // use mlock to keep model in memory
118
+ bool numa = false; // attempt optimizations that help on some NUMA systems
119
+ bool export_cgraph = false; // export the computation graph
120
+ bool verbose_prompt = false; // print prompt tokens before generation
121
+ };
122
+
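Since every field above carries a default, callers usually construct the struct and override only what they need; a small hedged example (the model path is illustrative):

    gpt_params params;
    params.model     = "models/7B/ggml-model-f16.gguf";  // illustrative path
    params.n_ctx     = 2048;
    params.n_threads = 8;
    params.temp      = 0.7f;
    params.prompt    = "Once upon a time";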
123
+ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
124
+
125
+ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
126
+
127
+ std::string gpt_random_prompt(std::mt19937 & rng);
128
+
129
+ //
130
+ // Model utils
131
+ //
132
+
133
+ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
134
+ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
135
+
136
+ //
137
+ // Vocab utils
138
+ //
139
+
140
+ // tokenizes a string into a vector of tokens
141
+ // should work similarly to Python's `tokenizer.encode`
142
+ std::vector<llama_token> llama_tokenize(
143
+ struct llama_context * ctx,
144
+ const std::string & text,
145
+ bool add_bos);
146
+
147
+ // tokenizes a token into a piece
148
+ // should work similarly to Python's `tokenizer.id_to_piece`
149
+ std::string llama_token_to_piece(
150
+ const struct llama_context * ctx,
151
+ llama_token token);
152
+
153
+ // TODO: these should be moved into the llama.h C-style API under a single `llama_detokenize` function
154
+ // that takes into account the tokenizer type and decides how to handle the leading space
155
+ //
156
+ // detokenizes a vector of tokens into a string
157
+ // should work similarly to Python's `tokenizer.decode`
158
+ // removes the leading space from the first non-BOS token
159
+ std::string llama_detokenize_spm(
160
+ llama_context * ctx,
161
+ const std::vector<llama_token> & tokens);
162
+
163
+ // detokenizes a vector of tokens into a string
164
+ // should work similarly to Python's `tokenizer.decode`
165
+ std::string llama_detokenize_bpe(
166
+ llama_context * ctx,
167
+ const std::vector<llama_token> & tokens);
168
+
169
+ //
170
+ // Sampling utils
171
+ //
172
+
173
+ // this is a common sampling function used across the examples for convenience
174
+ // it can serve as a starting point for implementing your own sampling function
175
+ //
176
+ // required:
177
+ // - ctx: context to use for sampling
178
+ // - params: sampling parameters
179
+ //
180
+ // optional:
181
+ // - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
182
+ // - grammar: grammar to use for sampling, ignore if NULL
183
+ // - last_tokens: needed for repetition penalty, ignore if empty
184
+ // - idx: sample from llama_get_logits(ctx) + idx * n_vocab
185
+ //
186
+ // returns:
187
+ // - token: sampled token
188
+ // - candidates: vector of candidate tokens
189
+ //
190
+ llama_token llama_sample_token(
191
+ struct llama_context * ctx,
192
+ struct llama_context * ctx_guidance,
193
+ struct llama_grammar * grammar,
194
+ const struct gpt_params & params,
195
+ const std::vector<llama_token> & last_tokens,
196
+ std::vector<llama_token_data> & candidates,
197
+ int idx = 0);
198
+
199
+ //
200
+ // YAML utils
201
+ //
202
+
203
+ bool create_directory_with_parents(const std::string & path);
204
+ void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
205
+ void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
206
+ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
207
+ std::string get_sortable_timestamp();
208
+
209
+ void dump_non_result_info_yaml(
210
+ FILE * stream, const gpt_params & params, const llama_context * lctx,
211
+ const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
common/console.cpp ADDED
@@ -0,0 +1,501 @@
1
+ #include "console.h"
2
+ #include <vector>
3
+ #include <iostream>
4
+
5
+ #if defined(_WIN32)
6
+ #define WIN32_LEAN_AND_MEAN
7
+ #ifndef NOMINMAX
8
+ #define NOMINMAX
9
+ #endif
10
+ #include <windows.h>
11
+ #include <fcntl.h>
12
+ #include <io.h>
13
+ #ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
14
+ #define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
15
+ #endif
16
+ #else
17
+ #include <climits>
18
+ #include <sys/ioctl.h>
19
+ #include <unistd.h>
20
+ #include <wchar.h>
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <signal.h>
24
+ #include <termios.h>
25
+ #endif
26
+
27
+ #define ANSI_COLOR_RED "\x1b[31m"
28
+ #define ANSI_COLOR_GREEN "\x1b[32m"
29
+ #define ANSI_COLOR_YELLOW "\x1b[33m"
30
+ #define ANSI_COLOR_BLUE "\x1b[34m"
31
+ #define ANSI_COLOR_MAGENTA "\x1b[35m"
32
+ #define ANSI_COLOR_CYAN "\x1b[36m"
33
+ #define ANSI_COLOR_RESET "\x1b[0m"
34
+ #define ANSI_BOLD "\x1b[1m"
35
+
36
+ namespace console {
37
+
38
+ //
39
+ // Console state
40
+ //
41
+
42
+ static bool advanced_display = false;
43
+ static bool simple_io = true;
44
+ static display_t current_display = reset;
45
+
46
+ static FILE* out = stdout;
47
+
48
+ #if defined (_WIN32)
49
+ static void* hConsole;
50
+ #else
51
+ static FILE* tty = nullptr;
52
+ static termios initial_state;
53
+ #endif
54
+
55
+ //
56
+ // Init and cleanup
57
+ //
58
+
59
+ void init(bool use_simple_io, bool use_advanced_display) {
60
+ advanced_display = use_advanced_display;
61
+ simple_io = use_simple_io;
62
+ #if defined(_WIN32)
63
+ // Windows-specific console initialization
64
+ DWORD dwMode = 0;
65
+ hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
66
+ if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
67
+ hConsole = GetStdHandle(STD_ERROR_HANDLE);
68
+ if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) {
69
+ hConsole = nullptr;
70
+ simple_io = true;
71
+ }
72
+ }
73
+ if (hConsole) {
74
+ // Check conditions combined to reduce nesting
75
+ if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
76
+ !SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
77
+ advanced_display = false;
78
+ }
79
+ // Set console output codepage to UTF8
80
+ SetConsoleOutputCP(CP_UTF8);
81
+ }
82
+ HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
83
+ if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
84
+ // Set console input codepage to UTF16
85
+ _setmode(_fileno(stdin), _O_WTEXT);
86
+
87
+ // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
88
+ if (simple_io) {
89
+ dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
90
+ } else {
91
+ dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
92
+ }
93
+ if (!SetConsoleMode(hConIn, dwMode)) {
94
+ simple_io = true;
95
+ }
96
+ }
97
+ #else
98
+ // POSIX-specific console initialization
99
+ if (!simple_io) {
100
+ struct termios new_termios;
101
+ tcgetattr(STDIN_FILENO, &initial_state);
102
+ new_termios = initial_state;
103
+ new_termios.c_lflag &= ~(ICANON | ECHO);
104
+ new_termios.c_cc[VMIN] = 1;
105
+ new_termios.c_cc[VTIME] = 0;
106
+ tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
107
+
108
+ tty = fopen("/dev/tty", "w+");
109
+ if (tty != nullptr) {
110
+ out = tty;
111
+ }
112
+ }
113
+
114
+ setlocale(LC_ALL, "");
115
+ #endif
116
+ }
117
+
118
+ void cleanup() {
119
+ // Reset console display
120
+ set_display(reset);
121
+
122
+ #if !defined(_WIN32)
123
+ // Restore settings on POSIX systems
124
+ if (!simple_io) {
125
+ if (tty != nullptr) {
126
+ out = stdout;
127
+ fclose(tty);
128
+ tty = nullptr;
129
+ }
130
+ tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
131
+ }
132
+ #endif
133
+ }
134
+
135
+ //
136
+ // Display and IO
137
+ //
138
+
139
+ // Keep track of current display and only emit ANSI code if it changes
140
+ void set_display(display_t display) {
141
+ if (advanced_display && current_display != display) {
142
+ fflush(stdout);
143
+ switch(display) {
144
+ case reset:
145
+ fprintf(out, ANSI_COLOR_RESET);
146
+ break;
147
+ case prompt:
148
+ fprintf(out, ANSI_COLOR_YELLOW);
149
+ break;
150
+ case user_input:
151
+ fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
152
+ break;
153
+ case error:
154
+ fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
155
+ }
156
+ current_display = display;
157
+ fflush(out);
158
+ }
159
+ }
160
+
161
+ static char32_t getchar32() {
162
+ #if defined(_WIN32)
163
+ HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
164
+ wchar_t high_surrogate = 0;
165
+
166
+ while (true) {
167
+ INPUT_RECORD record;
168
+ DWORD count;
169
+ if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
170
+ return WEOF;
171
+ }
172
+
173
+ if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
174
+ wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
175
+ if (wc == 0) {
176
+ continue;
177
+ }
178
+
179
+ if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
180
+ high_surrogate = wc;
181
+ continue;
182
+ }
183
+ if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
184
+ if (high_surrogate != 0) { // Check if we have a high surrogate
185
+ return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
186
+ }
187
+ }
188
+
189
+ high_surrogate = 0; // Reset the high surrogate
190
+ return static_cast<char32_t>(wc);
191
+ }
192
+ }
193
+ #else
194
+ wchar_t wc = getwchar();
195
+ if (static_cast<wint_t>(wc) == WEOF) {
196
+ return WEOF;
197
+ }
198
+
199
+ #if WCHAR_MAX == 0xFFFF
200
+ if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
201
+ wchar_t low_surrogate = getwchar();
202
+ if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
203
+ return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
204
+ }
205
+ }
206
+ if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
207
+ return 0xFFFD; // Return the replacement character U+FFFD
208
+ }
209
+ #endif
210
+
211
+ return static_cast<char32_t>(wc);
212
+ #endif
213
+ }
214
+
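The surrogate handling above is standard UTF-16 decoding: the high surrogate supplies the upper 10 bits, the low surrogate the lower 10 bits, and the result is offset by 0x10000. A worked example as a compile-time check:

    // U+1F600 is encoded in UTF-16 as the surrogate pair D83D DE00:
    //   ((0xD83D - 0xD800) << 10) + (0xDE00 - 0xDC00) + 0x10000
    // =  (0x3D << 10)             + 0x200             + 0x10000  = 0x1F600
    static_assert(((0xD83D - 0xD800) << 10) + (0xDE00 - 0xDC00) + 0x10000 == 0x1F600,
                  "surrogate pair decodes to U+1F600");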
215
+ static void pop_cursor() {
216
+ #if defined(_WIN32)
217
+ if (hConsole != NULL) {
218
+ CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
219
+ GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
220
+
221
+ COORD newCursorPosition = bufferInfo.dwCursorPosition;
222
+ if (newCursorPosition.X == 0) {
223
+ newCursorPosition.X = bufferInfo.dwSize.X - 1;
224
+ newCursorPosition.Y -= 1;
225
+ } else {
226
+ newCursorPosition.X -= 1;
227
+ }
228
+
229
+ SetConsoleCursorPosition(hConsole, newCursorPosition);
230
+ return;
231
+ }
232
+ #endif
233
+ putc('\b', out);
234
+ }
235
+
236
+ static int estimateWidth(char32_t codepoint) {
237
+ #if defined(_WIN32)
238
+ (void)codepoint;
239
+ return 1;
240
+ #else
241
+ return wcwidth(codepoint);
242
+ #endif
243
+ }
244
+
245
+ static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
246
+ #if defined(_WIN32)
247
+ CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
248
+ if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
249
+ // go with the default
250
+ return expectedWidth;
251
+ }
252
+ COORD initialPosition = bufferInfo.dwCursorPosition;
253
+ DWORD nNumberOfChars = length;
254
+ WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
255
+
256
+ CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
257
+ GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
258
+
259
+ // Figure out our real position if we're in the last column
260
+ if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
261
+ DWORD nNumberOfChars;
262
+ WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL);
263
+ GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
264
+ }
265
+
266
+ int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
267
+ if (width < 0) {
268
+ width += newBufferInfo.dwSize.X;
269
+ }
270
+ return width;
271
+ #else
272
+ // We can trust expectedWidth if we've got one
273
+ if (expectedWidth >= 0 || tty == nullptr) {
274
+ fwrite(utf8_codepoint, length, 1, out);
275
+ return expectedWidth;
276
+ }
277
+
278
+ fputs("\033[6n", tty); // Query cursor position
279
+ int x1;
280
+ int y1;
281
+ int x2;
282
+ int y2;
283
+ int results = 0;
284
+ results = fscanf(tty, "\033[%d;%dR", &y1, &x1);
285
+
286
+ fwrite(utf8_codepoint, length, 1, tty);
287
+
288
+ fputs("\033[6n", tty); // Query cursor position
289
+ results += fscanf(tty, "\033[%d;%dR", &y2, &x2);
290
+
291
+ if (results != 4) {
292
+ return expectedWidth;
293
+ }
294
+
295
+ int width = x2 - x1;
296
+ if (width < 0) {
297
+ // Calculate the width considering text wrapping
298
+ struct winsize w;
299
+ ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
300
+ width += w.ws_col;
301
+ }
302
+ return width;
303
+ #endif
304
+ }
305
+
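On the POSIX side, the width measurement above relies on the ANSI "Device Status Report" sequence: writing `ESC [ 6 n` makes a VT100-compatible terminal answer with `ESC [ row ; col R`, and the width is the column difference across the write. A stripped-down sketch of just that query (hypothetical helper, assumes such a terminal on `tty`):

    // Ask the terminal where the cursor currently is, as put_codepoint() does twice.
    static int cursor_column(FILE * tty) {
        int row = 0;
        int col = 0;
        fputs("\033[6n", tty);                           // DSR: request cursor position
        if (fscanf(tty, "\033[%d;%dR", &row, &col) != 2) {
            return -1;                                   // terminal did not answer
        }
        return col;
    }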
306
+ static void replace_last(char ch) {
307
+ #if defined(_WIN32)
308
+ pop_cursor();
309
+ put_codepoint(&ch, 1, 1);
310
+ #else
311
+ fprintf(out, "\b%c", ch);
312
+ #endif
313
+ }
314
+
315
+ static void append_utf8(char32_t ch, std::string & out) {
316
+ if (ch <= 0x7F) {
317
+ out.push_back(static_cast<unsigned char>(ch));
318
+ } else if (ch <= 0x7FF) {
319
+ out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
320
+ out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
321
+ } else if (ch <= 0xFFFF) {
322
+ out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
323
+ out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
324
+ out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
325
+ } else if (ch <= 0x10FFFF) {
326
+ out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
327
+ out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
328
+ out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
329
+ out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
330
+ } else {
331
+ // Invalid Unicode code point
332
+ }
333
+ }
334
+
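The branches above are plain UTF-8 encoding, using 1 to 4 bytes depending on the code point. A worked example (the Euro sign, U+20AC, takes the three-byte form):

    //   0xE0 | (0x20AC >> 12)         = 0xE2
    //   0x80 | ((0x20AC >> 6) & 0x3F) = 0x82
    //   0x80 | (0x20AC & 0x3F)        = 0xAC
    std::string s;
    append_utf8(0x20AC, s);   // s now holds the bytes E2 82 AC, i.e. "\xE2\x82\xAC"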
335
+ // Helper function to remove the last UTF-8 character from a string
336
+ static void pop_back_utf8_char(std::string & line) {
337
+ if (line.empty()) {
338
+ return;
339
+ }
340
+
341
+ size_t pos = line.length() - 1;
342
+
343
+ // Find the start of the last UTF-8 character (checking up to 4 bytes back)
344
+ for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
345
+ if ((line[pos] & 0xC0) != 0x80) {
346
+ break; // Found the start of the character
347
+ }
348
+ }
349
+ line.erase(pos);
350
+ }
351
+
352
+ static bool readline_advanced(std::string & line, bool multiline_input) {
353
+ if (out != stdout) {
354
+ fflush(stdout);
355
+ }
356
+
357
+ line.clear();
358
+ std::vector<int> widths;
359
+ bool is_special_char = false;
360
+ bool end_of_stream = false;
361
+
362
+ char32_t input_char;
363
+ while (true) {
364
+ fflush(out); // Ensure all output is displayed before waiting for input
365
+ input_char = getchar32();
366
+
367
+ if (input_char == '\r' || input_char == '\n') {
368
+ break;
369
+ }
370
+
371
+ if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
372
+ end_of_stream = true;
373
+ break;
374
+ }
375
+
376
+ if (is_special_char) {
377
+ set_display(user_input);
378
+ replace_last(line.back());
379
+ is_special_char = false;
380
+ }
381
+
382
+ if (input_char == '\033') { // Escape sequence
383
+ char32_t code = getchar32();
384
+ if (code == '[' || code == 0x1B) {
385
+ // Discard the rest of the escape sequence
386
+ while ((code = getchar32()) != (char32_t) WEOF) {
387
+ if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
388
+ break;
389
+ }
390
+ }
391
+ }
392
+ } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
393
+ if (!widths.empty()) {
394
+ int count;
395
+ do {
396
+ count = widths.back();
397
+ widths.pop_back();
398
+ // Move cursor back, print space, and move cursor back again
399
+ for (int i = 0; i < count; i++) {
400
+ replace_last(' ');
401
+ pop_cursor();
402
+ }
403
+ pop_back_utf8_char(line);
404
+ } while (count == 0 && !widths.empty());
405
+ }
406
+ } else {
407
+ int offset = line.length();
408
+ append_utf8(input_char, line);
409
+ int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
410
+ if (width < 0) {
411
+ width = 0;
412
+ }
413
+ widths.push_back(width);
414
+ }
415
+
416
+ if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
417
+ set_display(prompt);
418
+ replace_last(line.back());
419
+ is_special_char = true;
420
+ }
421
+ }
422
+
423
+ bool has_more = multiline_input;
424
+ if (is_special_char) {
425
+ replace_last(' ');
426
+ pop_cursor();
427
+
428
+ char last = line.back();
429
+ line.pop_back();
430
+ if (last == '\\') {
431
+ line += '\n';
432
+ fputc('\n', out);
433
+ has_more = !has_more;
434
+ } else {
435
+ // llama will just eat the single space, it won't act as a space
436
+ if (line.length() == 1 && line.back() == ' ') {
437
+ line.clear();
438
+ pop_cursor();
439
+ }
440
+ has_more = false;
441
+ }
442
+ } else {
443
+ if (end_of_stream) {
444
+ has_more = false;
445
+ } else {
446
+ line += '\n';
447
+ fputc('\n', out);
448
+ }
449
+ }
450
+
451
+ fflush(out);
452
+ return has_more;
453
+ }
454
+
455
+ static bool readline_simple(std::string & line, bool multiline_input) {
456
+ #if defined(_WIN32)
457
+ std::wstring wline;
458
+ if (!std::getline(std::wcin, wline)) {
459
+ // Input stream is bad or EOF received
460
+ line.clear();
461
+ GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0);
462
+ return false;
463
+ }
464
+
465
+ int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL);
466
+ line.resize(size_needed);
467
+ WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL);
468
+ #else
469
+ if (!std::getline(std::cin, line)) {
470
+ // Input stream is bad or EOF received
471
+ line.clear();
472
+ return false;
473
+ }
474
+ #endif
475
+ if (!line.empty()) {
476
+ char last = line.back();
477
+ if (last == '/') { // Always return control on '/' symbol
478
+ line.pop_back();
479
+ return false;
480
+ }
481
+ if (last == '\\') { // '\\' changes the default action
482
+ line.pop_back();
483
+ multiline_input = !multiline_input;
484
+ }
485
+ }
486
+ line += '\n';
487
+
488
+ // By default, continue input if multiline_input is set
489
+ return multiline_input;
490
+ }
491
+
492
+ bool readline(std::string & line, bool multiline_input) {
493
+ set_display(user_input);
494
+
495
+ if (simple_io) {
496
+ return readline_simple(line, multiline_input);
497
+ }
498
+ return readline_advanced(line, multiline_input);
499
+ }
500
+
501
+ }
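
For reference, a minimal sketch (not included in this commit) of how the readline() implementation above is typically driven, assuming the header below is included as "common/console.h": a trailing '\' toggles multi-line continuation, a trailing '/' returns control immediately, and the caller keeps appending lines while readline() reports that more input is expected.

    #include "common/console.h"
    #include <cstdio>
    #include <string>

    int main() {
        console::init(/*use_simple_io=*/false, /*use_advanced_display=*/true);

        console::set_display(console::prompt);
        printf("> ");
        fflush(stdout);

        std::string buffer;          // full, possibly multi-line, user input
        std::string line;
        bool another_line = true;
        while (another_line) {
            another_line = console::readline(line, /*multiline_input=*/false);
            buffer += line;          // readline() appends '\n' where appropriate
        }

        console::set_display(console::reset);
        printf("you typed: %s", buffer.c_str());

        console::cleanup();
        return 0;
    }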
common/console.h ADDED
@@ -0,0 +1,19 @@
1
+ // Console functions
2
+
3
+ #pragma once
4
+
5
+ #include <string>
6
+
7
+ namespace console {
8
+ enum display_t {
9
+ reset = 0,
10
+ prompt,
11
+ user_input,
12
+ error
13
+ };
14
+
15
+ void init(bool use_simple_io, bool use_advanced_display);
16
+ void cleanup();
17
+ void set_display(display_t display);
18
+ bool readline(std::string & line, bool multiline_input);
19
+ }
common/grammar-parser.cpp ADDED
@@ -0,0 +1,424 @@
1
+ #include "grammar-parser.h"
2
+ #include <cstdint>
3
+ #include <cwchar>
4
+ #include <string>
5
+ #include <utility>
6
+ #include <stdexcept>
7
+ #include <exception>
8
+
9
+ namespace grammar_parser {
10
+ // NOTE: assumes valid utf8 (but checks for overrun)
11
+ // copied from llama.cpp
12
+ static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
13
+ static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
14
+ uint8_t first_byte = static_cast<uint8_t>(*src);
15
+ uint8_t highbits = first_byte >> 4;
16
+ int len = lookup[highbits];
17
+ uint8_t mask = (1 << (8 - len)) - 1;
18
+ uint32_t value = first_byte & mask;
19
+ const char * end = src + len; // may overrun!
20
+ const char * pos = src + 1;
21
+ for ( ; pos < end && *pos; pos++) {
22
+ value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
23
+ }
24
+ return std::make_pair(value, pos);
25
+ }
26
+
27
+ static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
28
+ uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
29
+ auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
30
+ return result.first->second;
31
+ }
32
+
33
+ static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
34
+ uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
35
+ state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
36
+ return next_id;
37
+ }
38
+
39
+ static void add_rule(
40
+ parse_state & state,
41
+ uint32_t rule_id,
42
+ const std::vector<llama_grammar_element> & rule) {
43
+ if (state.rules.size() <= rule_id) {
44
+ state.rules.resize(rule_id + 1);
45
+ }
46
+ state.rules[rule_id] = rule;
47
+ }
48
+
49
+ static bool is_word_char(char c) {
50
+ return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
51
+ }
52
+
53
+ static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
54
+ const char * pos = src;
55
+ const char * end = src + size;
56
+ uint32_t value = 0;
57
+ for ( ; pos < end && *pos; pos++) {
58
+ value <<= 4;
59
+ char c = *pos;
60
+ if ('a' <= c && c <= 'f') {
61
+ value += c - 'a' + 10;
62
+ } else if ('A' <= c && c <= 'F') {
63
+ value += c - 'A' + 10;
64
+ } else if ('0' <= c && c <= '9') {
65
+ value += c - '0';
66
+ } else {
67
+ break;
68
+ }
69
+ }
70
+ if (pos != end) {
71
+ throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
72
+ }
73
+ return std::make_pair(value, pos);
74
+ }
75
+
76
+ static const char * parse_space(const char * src, bool newline_ok) {
77
+ const char * pos = src;
78
+ while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
79
+ (newline_ok && (*pos == '\r' || *pos == '\n'))) {
80
+ if (*pos == '#') {
81
+ while (*pos && *pos != '\r' && *pos != '\n') {
82
+ pos++;
83
+ }
84
+ } else {
85
+ pos++;
86
+ }
87
+ }
88
+ return pos;
89
+ }
90
+
91
+ static const char * parse_name(const char * src) {
92
+ const char * pos = src;
93
+ while (is_word_char(*pos)) {
94
+ pos++;
95
+ }
96
+ if (pos == src) {
97
+ throw std::runtime_error(std::string("expecting name at ") + src);
98
+ }
99
+ return pos;
100
+ }
101
+
102
+ static std::pair<uint32_t, const char *> parse_char(const char * src) {
103
+ if (*src == '\\') {
104
+ switch (src[1]) {
105
+ case 'x': return parse_hex(src + 2, 2);
106
+ case 'u': return parse_hex(src + 2, 4);
107
+ case 'U': return parse_hex(src + 2, 8);
108
+ case 't': return std::make_pair('\t', src + 2);
109
+ case 'r': return std::make_pair('\r', src + 2);
110
+ case 'n': return std::make_pair('\n', src + 2);
111
+ case '\\':
112
+ case '"':
113
+ case '[':
114
+ case ']':
115
+ return std::make_pair(src[1], src + 2);
116
+ default:
117
+ throw std::runtime_error(std::string("unknown escape at ") + src);
118
+ }
119
+ } else if (*src) {
120
+ return decode_utf8(src);
121
+ }
122
+ throw std::runtime_error("unexpected end of input");
123
+ }
124
+
125
+ const char * parse_alternates(
126
+ parse_state & state,
127
+ const char * src,
128
+ const std::string & rule_name,
129
+ uint32_t rule_id,
130
+ bool is_nested);
131
+
132
+ static const char * parse_sequence(
133
+ parse_state & state,
134
+ const char * src,
135
+ const std::string & rule_name,
136
+ std::vector<llama_grammar_element> & out_elements,
137
+ bool is_nested) {
138
+ size_t last_sym_start = out_elements.size();
139
+ const char * pos = src;
140
+ while (*pos) {
141
+ if (*pos == '"') { // literal string
142
+ pos++;
143
+ last_sym_start = out_elements.size();
144
+ while (*pos != '"') {
145
+ auto char_pair = parse_char(pos);
146
+ pos = char_pair.second;
147
+ out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
148
+ }
149
+ pos = parse_space(pos + 1, is_nested);
150
+ } else if (*pos == '[') { // char range(s)
151
+ pos++;
152
+ enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
153
+ if (*pos == '^') {
154
+ pos++;
155
+ start_type = LLAMA_GRETYPE_CHAR_NOT;
156
+ }
157
+ last_sym_start = out_elements.size();
158
+ while (*pos != ']') {
159
+ auto char_pair = parse_char(pos);
160
+ pos = char_pair.second;
161
+ enum llama_gretype type = last_sym_start < out_elements.size()
162
+ ? LLAMA_GRETYPE_CHAR_ALT
163
+ : start_type;
164
+
165
+ out_elements.push_back({type, char_pair.first});
166
+ if (pos[0] == '-' && pos[1] != ']') {
167
+ auto endchar_pair = parse_char(pos + 1);
168
+ pos = endchar_pair.second;
169
+ out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
170
+ }
171
+ }
172
+ pos = parse_space(pos + 1, is_nested);
173
+ } else if (is_word_char(*pos)) { // rule reference
174
+ const char * name_end = parse_name(pos);
175
+ uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
176
+ pos = parse_space(name_end, is_nested);
177
+ last_sym_start = out_elements.size();
178
+ out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
179
+ } else if (*pos == '(') { // grouping
180
+ // parse nested alternates into synthesized rule
181
+ pos = parse_space(pos + 1, true);
182
+ uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
183
+ pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
184
+ last_sym_start = out_elements.size();
185
+ // output reference to synthesized rule
186
+ out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
187
+ if (*pos != ')') {
188
+ throw std::runtime_error(std::string("expecting ')' at ") + pos);
189
+ }
190
+ pos = parse_space(pos + 1, is_nested);
191
+ } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
192
+ if (last_sym_start == out_elements.size()) {
193
+ throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
194
+ }
195
+
196
+ // apply transformation to previous symbol (last_sym_start to end) according to
197
+ // rewrite rules:
198
+ // S* --> S' ::= S S' |
199
+ // S+ --> S' ::= S S' | S
200
+ // S? --> S' ::= S |
201
+ uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
202
+ std::vector<llama_grammar_element> sub_rule;
203
+ // add preceding symbol to generated rule
204
+ sub_rule.insert(
205
+ sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
206
+ if (*pos == '*' || *pos == '+') {
207
+ // cause generated rule to recurse
208
+ sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
209
+ }
210
+ // mark start of alternate def
211
+ sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
212
+ if (*pos == '+') {
213
+ // add preceding symbol as alternate only for '+' (otherwise empty)
214
+ sub_rule.insert(
215
+ sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
216
+ }
217
+ sub_rule.push_back({LLAMA_GRETYPE_END, 0});
218
+ add_rule(state, sub_rule_id, sub_rule);
219
+
220
+ // in original rule, replace previous symbol with reference to generated rule
221
+ out_elements.resize(last_sym_start);
222
+ out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
223
+
224
+ pos = parse_space(pos + 1, is_nested);
225
+ } else {
226
+ break;
227
+ }
228
+ }
229
+ return pos;
230
+ }
231
+
232
+ const char * parse_alternates(
233
+ parse_state & state,
234
+ const char * src,
235
+ const std::string & rule_name,
236
+ uint32_t rule_id,
237
+ bool is_nested) {
238
+ std::vector<llama_grammar_element> rule;
239
+ const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
240
+ while (*pos == '|') {
241
+ rule.push_back({LLAMA_GRETYPE_ALT, 0});
242
+ pos = parse_space(pos + 1, true);
243
+ pos = parse_sequence(state, pos, rule_name, rule, is_nested);
244
+ }
245
+ rule.push_back({LLAMA_GRETYPE_END, 0});
246
+ add_rule(state, rule_id, rule);
247
+ return pos;
248
+ }
249
+
250
+ static const char * parse_rule(parse_state & state, const char * src) {
251
+ const char * name_end = parse_name(src);
252
+ const char * pos = parse_space(name_end, false);
253
+ size_t name_len = name_end - src;
254
+ uint32_t rule_id = get_symbol_id(state, src, name_len);
255
+ const std::string name(src, name_len);
256
+
257
+ if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
258
+ throw std::runtime_error(std::string("expecting ::= at ") + pos);
259
+ }
260
+ pos = parse_space(pos + 3, true);
261
+
262
+ pos = parse_alternates(state, pos, name, rule_id, false);
263
+
264
+ if (*pos == '\r') {
265
+ pos += pos[1] == '\n' ? 2 : 1;
266
+ } else if (*pos == '\n') {
267
+ pos++;
268
+ } else if (*pos) {
269
+ throw std::runtime_error(std::string("expecting newline or end at ") + pos);
270
+ }
271
+ return parse_space(pos, true);
272
+ }
273
+
274
+ parse_state parse(const char * src) {
275
+ try {
276
+ parse_state state;
277
+ const char * pos = parse_space(src, true);
278
+ while (*pos) {
279
+ pos = parse_rule(state, pos);
280
+ }
281
+ return state;
282
+ } catch (const std::exception & err) {
283
+ fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
284
+ return parse_state();
285
+ }
286
+ }
287
+
288
+ static void print_grammar_char(FILE * file, uint32_t c) {
289
+ if (0x20 <= c && c <= 0x7f) {
290
+ fprintf(file, "%c", static_cast<char>(c));
291
+ } else {
292
+ // cop out of encoding UTF-8
293
+ fprintf(file, "<U+%04X>", c);
294
+ }
295
+ }
296
+
297
+ static bool is_char_element(llama_grammar_element elem) {
298
+ switch (elem.type) {
299
+ case LLAMA_GRETYPE_CHAR: return true;
300
+ case LLAMA_GRETYPE_CHAR_NOT: return true;
301
+ case LLAMA_GRETYPE_CHAR_ALT: return true;
302
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
303
+ default: return false;
304
+ }
305
+ }
306
+
307
+ static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
308
+ for (auto elem : rule) {
309
+ switch (elem.type) {
310
+ case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
311
+ case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
312
+ case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
313
+ case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
314
+ case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
315
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
316
+ case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
317
+ }
318
+ switch (elem.type) {
319
+ case LLAMA_GRETYPE_END:
320
+ case LLAMA_GRETYPE_ALT:
321
+ case LLAMA_GRETYPE_RULE_REF:
322
+ fprintf(file, "(%u) ", elem.value);
323
+ break;
324
+ case LLAMA_GRETYPE_CHAR:
325
+ case LLAMA_GRETYPE_CHAR_NOT:
326
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER:
327
+ case LLAMA_GRETYPE_CHAR_ALT:
328
+ fprintf(file, "(\"");
329
+ print_grammar_char(file, elem.value);
330
+ fprintf(file, "\") ");
331
+ break;
332
+ }
333
+ }
334
+ fprintf(file, "\n");
335
+ }
336
+
337
+ static void print_rule(
338
+ FILE * file,
339
+ uint32_t rule_id,
340
+ const std::vector<llama_grammar_element> & rule,
341
+ const std::map<uint32_t, std::string> & symbol_id_names) {
342
+ if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
343
+ throw std::runtime_error(
344
+ "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
345
+ }
346
+ fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
347
+ for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
348
+ llama_grammar_element elem = rule[i];
349
+ switch (elem.type) {
350
+ case LLAMA_GRETYPE_END:
351
+ throw std::runtime_error(
352
+ "unexpected end of rule: " + std::to_string(rule_id) + "," +
353
+ std::to_string(i));
354
+ case LLAMA_GRETYPE_ALT:
355
+ fprintf(file, "| ");
356
+ break;
357
+ case LLAMA_GRETYPE_RULE_REF:
358
+ fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
359
+ break;
360
+ case LLAMA_GRETYPE_CHAR:
361
+ fprintf(file, "[");
362
+ print_grammar_char(file, elem.value);
363
+ break;
364
+ case LLAMA_GRETYPE_CHAR_NOT:
365
+ fprintf(file, "[^");
366
+ print_grammar_char(file, elem.value);
367
+ break;
368
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER:
369
+ if (i == 0 || !is_char_element(rule[i - 1])) {
370
+ throw std::runtime_error(
371
+ "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
372
+ std::to_string(rule_id) + "," + std::to_string(i));
373
+ }
374
+ fprintf(file, "-");
375
+ print_grammar_char(file, elem.value);
376
+ break;
377
+ case LLAMA_GRETYPE_CHAR_ALT:
378
+ if (i == 0 || !is_char_element(rule[i - 1])) {
379
+ throw std::runtime_error(
380
+ "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
381
+ std::to_string(rule_id) + "," + std::to_string(i));
382
+ }
383
+ print_grammar_char(file, elem.value);
384
+ break;
385
+ }
386
+ if (is_char_element(elem)) {
387
+ switch (rule[i + 1].type) {
388
+ case LLAMA_GRETYPE_CHAR_ALT:
389
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER:
390
+ break;
391
+ default:
392
+ fprintf(file, "] ");
393
+ }
394
+ }
395
+ }
396
+ fprintf(file, "\n");
397
+ }
398
+
399
+ void print_grammar(FILE * file, const parse_state & state) {
400
+ try {
401
+ std::map<uint32_t, std::string> symbol_id_names;
402
+ for (auto kv : state.symbol_ids) {
403
+ symbol_id_names[kv.second] = kv.first;
404
+ }
405
+ for (size_t i = 0, end = state.rules.size(); i < end; i++) {
406
+ // fprintf(file, "%zu: ", i);
407
+ // print_rule_binary(file, state.rules[i]);
408
+ print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
409
+ // fprintf(file, "\n");
410
+ }
411
+ } catch (const std::exception & err) {
412
+ fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
413
+ }
414
+ }
415
+
416
+ std::vector<const llama_grammar_element *> parse_state::c_rules() {
417
+ std::vector<const llama_grammar_element *> ret;
418
+ ret.reserve(rules.size());
419
+ for (const auto & rule : rules) {
420
+ ret.push_back(rule.data());
421
+ }
422
+ return ret;
423
+ }
424
+ }
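
For reference (not included in this commit): the repetition branch of parse_sequence() applies the rewrite rules quoted in its comment, so a rule such as "num ::= [0-9]+" is stored as a reference to a synthesized recursive rule; "num_1" below stands in for the internally generated "<rule>_<id>" name:

    num   ::= num_1                  # the original rule now references the synthesized one
    num_1 ::= [0-9] num_1 | [0-9]    # S+  -->  S' ::= S S' | S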
common/grammar-parser.h ADDED
@@ -0,0 +1,29 @@
1
+ // Implements a parser for an extended Backus-Naur form (BNF), producing the
2
+ // binary context-free grammar format specified by llama.h. Supports character
3
+ // ranges, grouping, and repetition operators. As an example, a grammar for
4
+ // arithmetic might look like:
5
+ //
6
+ // root ::= expr
7
+ // expr ::= term ([-+*/] term)*
8
+ // term ::= num | "(" space expr ")" space
9
+ // num ::= [0-9]+ space
10
+ // space ::= [ \t\n]*
11
+
12
+ #pragma once
13
+ #include "llama.h"
14
+ #include <vector>
15
+ #include <map>
16
+ #include <cstdint>
17
+ #include <string>
18
+
19
+ namespace grammar_parser {
20
+ struct parse_state {
21
+ std::map<std::string, uint32_t> symbol_ids;
22
+ std::vector<std::vector<llama_grammar_element>> rules;
23
+
24
+ std::vector<const llama_grammar_element *> c_rules();
25
+ };
26
+
27
+ parse_state parse(const char * src);
28
+ void print_grammar(FILE * file, const parse_state & state);
29
+ }
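
For reference, a minimal usage sketch of the API declared above (not included in this commit; the grammar string is a made-up example and the include path "common/grammar-parser.h" is assumed):

    #include "common/grammar-parser.h"
    #include <cstdio>
    #include <vector>

    int main() {
        // root matches one or more digits followed by optional spaces
        const char * src =
            "root  ::= [0-9]+ space\n"
            "space ::= [ ]*\n";

        grammar_parser::parse_state state = grammar_parser::parse(src);
        if (state.rules.empty()) {
            // parse() already printed the reason to stderr
            return 1;
        }

        grammar_parser::print_grammar(stderr, state);

        // flat per-rule pointers, e.g. for handing the grammar over to the llama grammar API
        std::vector<const llama_grammar_element *> rules = state.c_rules();
        (void) rules;
        return 0;
    }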
common/log.h ADDED
@@ -0,0 +1,643 @@
1
+ #pragma once
2
+
3
+ #include <chrono>
4
+ #include <cstring>
5
+ #include <sstream>
6
+ #include <iostream>
7
+ #include <thread>
8
+ #include <vector>
9
+ #include <algorithm>
10
+ #include <cinttypes>
11
+
12
+ // --------------------------------
13
+ //
14
+ // Basic usage:
15
+ //
16
+ // --------
17
+ //
18
+ // The LOG() and LOG_TEE() macros are ready to go by default
19
+ // they do not require any initialization.
20
+ //
21
+ // LOGLN() and LOG_TEELN() are variants which automatically
22
+ // include \n character at the end of the log string.
23
+ //
24
+ // LOG() behaves exactly like printf, by default writing to a logfile.
25
+ // LOG_TEE() additionally prints to the screen ( mimics the Unix tee command ).
26
+ //
27
+ // Default logfile is named
28
+ // "llama.<threadID>.log"
29
+ // Default LOG_TEE() secondary output target is
30
+ // stderr
31
+ //
32
+ // Logs can be dynamically disabled or enabled using functions:
33
+ // log_disable()
34
+ // and
35
+ // log_enable()
36
+ //
37
+ // A log target can be changed with:
38
+ // log_set_target( string )
39
+ // creating and opening, or re-opening a file by string filename
40
+ // or
41
+ // log_set_target( FILE* )
42
+ // allowing it to point at stderr, stdout, or any valid FILE* handle.
43
+ //
44
+ // --------
45
+ //
46
+ // End of Basic usage.
47
+ //
48
+ // --------------------------------
49
+
50
+ // Specifies a log target.
51
+ // default uses log_handler() with "llama.log" log file
52
+ // this can be changed, by defining LOG_TARGET
53
+ // like so:
54
+ //
55
+ // #define LOG_TARGET (a valid FILE*)
56
+ // #include "log.h"
57
+ //
58
+ // or it can be simply redirected to stdout or stderr
59
+ // like so:
60
+ //
61
+ // #define LOG_TARGET stderr
62
+ // #include "log.h"
63
+ //
64
+ // The log target can also be redirected to a different function
65
+ // like so:
66
+ //
67
+ // #define LOG_TARGET log_handler_different()
68
+ // #include "log.h"
69
+ //
70
+ // FILE* log_handler_different()
71
+ // {
72
+ // return stderr;
73
+ // }
74
+ //
75
+ // or:
76
+ //
77
+ // #define LOG_TARGET log_handler_another_one("somelog.log")
78
+ // #include "log.h"
79
+ //
80
+ // FILE* log_handler_another_one(char*filename)
81
+ // {
82
+ // static FILE* logfile = nullptr;
83
+ // (...)
84
+ // if( !logfile )
85
+ // {
86
+ // fopen(...)
87
+ // }
88
+ // (...)
89
+ // return logfile
90
+ // }
91
+ //
92
+ #ifndef LOG_TARGET
93
+ #define LOG_TARGET log_handler()
94
+ #endif
95
+
96
+ #ifndef LOG_TEE_TARGET
97
+ #define LOG_TEE_TARGET stderr
98
+ #endif
99
+
100
+ // Utility to obtain "pid" like unique process id and use it when creating log files.
101
+ inline std::string log_get_pid()
102
+ {
103
+ static std::string pid;
104
+ if (pid.empty())
105
+ {
106
+ // std::this_thread::get_id() is the most portable way of obtaining a "process id"
107
+ // it's not the same as "pid" but is unique enough to solve multiple instances
108
+ // trying to write to the same log.
109
+ std::stringstream ss;
110
+ ss << std::this_thread::get_id();
111
+ pid = ss.str();
112
+ }
113
+
114
+ return pid;
115
+ }
116
+
117
+ // Utility function for generating log file names with unique id based on thread id.
118
+ // invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
119
+ // where the number is a runtime id of the current thread.
120
+
121
+ #define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
122
+
123
+ // INTERNAL, DO NOT USE
124
+ inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
125
+ {
126
+ std::stringstream buf;
127
+
128
+ buf << log_file_basename;
129
+ buf << ".";
130
+ buf << log_get_pid();
131
+ buf << ".";
132
+ buf << log_file_extension;
133
+
134
+ return buf.str();
135
+ }
136
+
137
+ #ifndef LOG_DEFAULT_FILE_NAME
138
+ #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
139
+ #endif
140
+
141
+ // Utility for turning #define values into string literals
142
+ // so we can have a define for stderr and
143
+ // we can print "stderr" instead of literal stderr, etc.
144
+ #define LOG_STRINGIZE1(s) #s
145
+ #define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
146
+
147
+ #define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
148
+
149
+ // Allows disabling timestamps.
150
+ // in order to disable, define LOG_NO_TIMESTAMPS
151
+ // like so:
152
+ //
153
+ // #define LOG_NO_TIMESTAMPS
154
+ // #include "log.h"
155
+ //
156
+ #ifndef LOG_NO_TIMESTAMPS
157
+ #ifndef _MSC_VER
158
+ #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
159
+ #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
160
+ #else
161
+ #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
162
+ #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
163
+ #endif
164
+ #else
165
+ #define LOG_TIMESTAMP_FMT "%s"
166
+ #define LOG_TIMESTAMP_VAL ,""
167
+ #endif
168
+
169
+ #ifdef LOG_TEE_TIMESTAMPS
170
+ #ifndef _MSC_VER
171
+ #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
172
+ #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
173
+ #else
174
+ #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
175
+ #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
176
+ #endif
177
+ #else
178
+ #define LOG_TEE_TIMESTAMP_FMT "%s"
179
+ #define LOG_TEE_TIMESTAMP_VAL ,""
180
+ #endif
181
+
182
+ // Allows disabling file/line/function prefix
183
+ // in order to disable, define LOG_NO_FILE_LINE_FUNCTION
184
+ // like so:
185
+ //
186
+ // #define LOG_NO_FILE_LINE_FUNCTION
187
+ // #include "log.h"
188
+ //
189
+ #ifndef LOG_NO_FILE_LINE_FUNCTION
190
+ #ifndef _MSC_VER
191
+ #define LOG_FLF_FMT "[%24s:%5d][%24s] "
192
+ #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
193
+ #else
194
+ #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
195
+ #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
196
+ #endif
197
+ #else
198
+ #define LOG_FLF_FMT "%s"
199
+ #define LOG_FLF_VAL ,""
200
+ #endif
201
+
202
+ #ifdef LOG_TEE_FILE_LINE_FUNCTION
203
+ #ifndef _MSC_VER
204
+ #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
205
+ #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
206
+ #else
207
+ #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
208
+ #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
209
+ #endif
210
+ #else
211
+ #define LOG_TEE_FLF_FMT "%s"
212
+ #define LOG_TEE_FLF_VAL ,""
213
+ #endif
214
+
215
+ // Utility for synchronizing log configuration state
216
+ // since std::optional was introduced only in c++17
217
+ enum LogTriState
218
+ {
219
+ LogTriStateSame,
220
+ LogTriStateFalse,
221
+ LogTriStateTrue
222
+ };
223
+
224
+ // INTERNAL, DO NOT USE
225
+ // USE LOG() INSTEAD
226
+ //
227
+ #ifndef _MSC_VER
228
+ #define LOG_IMPL(str, ...) \
229
+ { \
230
+ if (LOG_TARGET != nullptr) \
231
+ { \
232
+ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
233
+ fflush(LOG_TARGET); \
234
+ } \
235
+ }
236
+ #else
237
+ #define LOG_IMPL(str, ...) \
238
+ { \
239
+ if (LOG_TARGET != nullptr) \
240
+ { \
241
+ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
242
+ fflush(LOG_TARGET); \
243
+ } \
244
+ }
245
+ #endif
246
+
247
+ // INTERNAL, DO NOT USE
248
+ // USE LOG_TEE() INSTEAD
249
+ //
250
+ #ifndef _MSC_VER
251
+ #define LOG_TEE_IMPL(str, ...) \
252
+ { \
253
+ if (LOG_TARGET != nullptr) \
254
+ { \
255
+ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
256
+ fflush(LOG_TARGET); \
257
+ } \
258
+ if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
259
+ { \
260
+ fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
261
+ fflush(LOG_TEE_TARGET); \
262
+ } \
263
+ }
264
+ #else
265
+ #define LOG_TEE_IMPL(str, ...) \
266
+ { \
267
+ if (LOG_TARGET != nullptr) \
268
+ { \
269
+ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
270
+ fflush(LOG_TARGET); \
271
+ } \
272
+ if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
273
+ { \
274
+ fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
275
+ fflush(LOG_TEE_TARGET); \
276
+ } \
277
+ }
278
+ #endif
279
+
280
+ // The '\0' as a last argument, is a trick to bypass the silly
281
+ // "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
282
+ // so we can have a single macro which can be called just like printf.
283
+
284
+ // Main LOG macro.
285
+ // behaves like printf, and supports arguments the exact same way.
286
+ //
287
+ #ifndef _MSC_VER
288
+ #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
289
+ #else
290
+ #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
291
+ #endif
292
+
293
+ // Main TEE macro.
294
+ // does the same as LOG
295
+ // and
296
+ // simultaneously writes stderr.
297
+ //
298
+ // Secondary target can be changed just like LOG_TARGET
299
+ // by defining LOG_TEE_TARGET
300
+ //
301
+ #ifndef _MSC_VER
302
+ #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
303
+ #else
304
+ #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
305
+ #endif
306
+
307
+ // LOG macro variants with auto endline.
308
+ #ifndef _MSC_VER
309
+ #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
310
+ #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
311
+ #else
312
+ #define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
313
+ #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
314
+ #endif
315
+
316
+ // INTERNAL, DO NOT USE
317
+ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
318
+ {
319
+ static bool _initialized{false};
320
+ static bool _disabled{(filename.empty() && target == nullptr)};
321
+ static std::string log_current_filename{filename};
322
+ static FILE *log_current_target{target};
323
+ static FILE *logfile = nullptr;
324
+
325
+ if (change)
326
+ {
327
+ if (disable == LogTriStateTrue)
328
+ {
329
+ // Disable primary target
330
+ _disabled = true;
331
+ }
332
+ // If previously disabled, only enable, and keep previous target
333
+ else if (disable == LogTriStateFalse)
334
+ {
335
+ _disabled = false;
336
+ }
337
+ // Otherwise, process the arguments
338
+ else if (log_current_filename != filename || log_current_target != target)
339
+ {
340
+ _initialized = false;
341
+ }
342
+ }
343
+
344
+ if (_disabled)
345
+ {
346
+ // Log is disabled
347
+ return nullptr;
348
+ }
349
+
350
+ if (_initialized)
351
+ {
352
+ // with fallback in case something went wrong
353
+ return logfile ? logfile : stderr;
354
+ }
355
+
356
+ // do the (re)initialization
357
+ if (target != nullptr)
358
+ {
359
+ if (logfile != nullptr && logfile != stdout && logfile != stderr)
360
+ {
361
+ fclose(logfile);
362
+ }
363
+
364
+ log_current_filename = LOG_DEFAULT_FILE_NAME;
365
+ log_current_target = target;
366
+
367
+ logfile = target;
368
+ }
369
+ else
370
+ {
371
+ if (log_current_filename != filename)
372
+ {
373
+ if (logfile != nullptr && logfile != stdout && logfile != stderr)
374
+ {
375
+ fclose(logfile);
376
+ }
377
+ }
378
+
379
+ logfile = fopen(filename.c_str(), "w");
380
+ }
381
+
382
+ if (!logfile)
383
+ {
384
+ // Verify whether the file was opened, otherwise fallback to stderr
385
+ logfile = stderr;
386
+
387
+ fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
388
+ fflush(stderr);
389
+
390
+ // At this point we let the init flag be set to true below, and let the target fall back to stderr
391
+ // otherwise we would repeatedly fopen() which was already unsuccessful
392
+ }
393
+
394
+ _initialized = true;
395
+
396
+ return logfile ? logfile : stderr;
397
+ }
398
+
399
+ // INTERNAL, DO NOT USE
400
+ inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
401
+ {
402
+ return log_handler1_impl(change, disable, filename, target);
403
+ }
404
+
405
+ // Disables logs entirely at runtime.
406
+ // Makes LOG() and LOG_TEE() produce no output,
407
+ // until enabled again.
408
+ #define log_disable() log_disable_impl()
409
+
410
+ // INTERNAL, DO NOT USE
411
+ inline FILE *log_disable_impl()
412
+ {
413
+ return log_handler1_impl(true, LogTriStateTrue);
414
+ }
415
+
416
+ // Enables logs at runtime.
417
+ #define log_enable() log_enable_impl()
418
+
419
+ // INTERNAL, DO NOT USE
420
+ inline FILE *log_enable_impl()
421
+ {
422
+ return log_handler1_impl(true, LogTriStateFalse);
423
+ }
424
+
425
+ // Sets target for logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
426
+ #define log_set_target(target) log_set_target_impl(target)
427
+
428
+ // INTERNAL, DO NOT USE
429
+ inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
430
+ inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
431
+
432
+ // INTERNAL, DO NOT USE
433
+ inline FILE *log_handler() { return log_handler1_impl(); }
434
+
435
+ inline void log_test()
436
+ {
437
+ log_disable();
438
+ LOG("01 Hello World to nobody, because logs are disabled!\n")
439
+ log_enable();
440
+ LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET))
441
+ LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n")
442
+ log_set_target(stderr);
443
+ LOG("04 Hello World to stderr!\n")
444
+ LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n")
445
+ log_set_target(LOG_DEFAULT_FILE_NAME);
446
+ LOG("06 Hello World to default log file!\n")
447
+ log_set_target(stdout);
448
+ LOG("07 Hello World to stdout!\n")
449
+ log_set_target(LOG_DEFAULT_FILE_NAME);
450
+ LOG("08 Hello World to default log file again!\n")
451
+ log_disable();
452
+ LOG("09 Hello World _1_ into the void!\n")
453
+ log_enable();
454
+ LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n")
455
+ log_disable();
456
+ log_set_target("llama.anotherlog.log");
457
+ LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n")
458
+ log_enable();
459
+ LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n")
460
+ log_set_target("llama.yetanotherlog.log");
461
+ LOG("13 Hello World this time in yet new file?\n")
462
+ log_set_target(log_filename_generator("llama_autonamed", "log"));
463
+ LOG("14 Hello World in log with generated filename!\n")
464
+ #ifdef _MSC_VER
465
+ LOG_TEE("15 Hello msvc TEE without arguments\n")
466
+ LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
467
+ LOG_TEELN("17 Hello msvc TEELN without arguments\n")
468
+ LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test")
469
+ LOG("19 Hello msvc LOG without arguments\n")
470
+ LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test")
471
+ LOGLN("21 Hello msvc LOGLN without arguments\n")
472
+ LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test")
473
+ #endif
474
+ }
475
+
476
+ inline bool log_param_single_parse(const std::string & param)
477
+ {
478
+ if ( param == "--log-test")
479
+ {
480
+ log_test();
481
+ return true;
482
+ }
483
+
484
+ if ( param == "--log-disable")
485
+ {
486
+ log_disable();
487
+ return true;
488
+ }
489
+
490
+ if ( param == "--log-enable")
491
+ {
492
+ log_enable();
493
+ return true;
494
+ }
495
+
496
+ return false;
497
+ }
498
+
499
+ inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
500
+ {
501
+ if ( param == "--log-file")
502
+ {
503
+ if (!check_but_dont_parse)
504
+ {
505
+ log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
506
+ }
507
+
508
+ return true;
509
+ }
510
+
511
+ return false;
512
+ }
513
+
514
+ inline void log_print_usage()
515
+ {
516
+ printf("log options:\n");
517
+ /* format
518
+ printf(" -h, --help show this help message and exit\n");*/
519
+ /* spacing
520
+ printf("__-param----------------Description\n");*/
521
+ printf(" --log-test Run simple logging test\n");
522
+ printf(" --log-disable Disable trace logs\n");
523
+ printf(" --log-enable Enable trace logs\n");
524
+ printf(" --log-file Specify a log filename (without extension)\n");
525
+ printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
526
+ }
527
+
528
+ #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
529
+
530
+ // INTERNAL, DO NOT USE
531
+ inline void log_dump_cmdline_impl(int argc, char **argv)
532
+ {
533
+ std::stringstream buf;
534
+ for (int i = 0; i < argc; ++i)
535
+ {
536
+ if (std::string(argv[i]).find(' ') != std::string::npos)
537
+ {
538
+ buf << " \"" << argv[i] <<"\"";
539
+ }
540
+ else
541
+ {
542
+ buf << " " << argv[i];
543
+ }
544
+ }
545
+ LOGLN("Cmd:%s", buf.str().c_str())
546
+ }
547
+
548
+ #define log_tostr(var) log_var_to_string_impl(var).c_str()
549
+
550
+ inline std::string log_var_to_string_impl(bool var)
551
+ {
552
+ return var ? "true" : "false";
553
+ }
554
+
555
+ inline std::string log_var_to_string_impl(std::string var)
556
+ {
557
+ return var;
558
+ }
559
+
560
+ inline std::string log_var_to_string_impl(const std::vector<int> & var)
561
+ {
562
+ std::stringstream buf;
563
+ buf << "[ ";
564
+ bool first = true;
565
+ for (auto e : var)
566
+ {
567
+ if (first)
568
+ {
569
+ first = false;
570
+ }
571
+ else
572
+ {
573
+ buf << ", ";
574
+ }
575
+ buf << std::to_string(e);
576
+ }
577
+ buf << " ]";
578
+
579
+ return buf.str();
580
+ }
581
+
582
+ #define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \
583
+ [&tokens, &ctx]() \
584
+ { \
585
+ std::stringstream buf; \
586
+ buf << "[ "; \
587
+ \
588
+ bool first = true; \
589
+ for (const auto &token : tokens) \
590
+ { \
591
+ if (!first) \
592
+ buf << ", "; \
593
+ else \
594
+ first = false; \
595
+ \
596
+ auto detokenized = llama_token_to_piece(ctx, token); \
597
+ \
598
+ detokenized.erase( \
599
+ std::remove_if( \
600
+ detokenized.begin(), \
601
+ detokenized.end(), \
602
+ [](const unsigned char c) { return !std::isprint(c); }), \
603
+ detokenized.end()); \
604
+ \
605
+ buf \
606
+ << "'" << detokenized << "'" \
607
+ << ":" << std::to_string(token); \
608
+ } \
609
+ buf << " ]"; \
610
+ \
611
+ return buf.str(); \
612
+ }() \
613
+ .c_str()
614
+
615
+ #ifdef LOG_DISABLE_LOGS
616
+
617
+ #undef LOG
618
+ #define LOG(...) // dummy stub
619
+ #undef LOGLN
620
+ #define LOGLN(...) // dummy stub
621
+
622
+ #undef LOG_TEE
623
+ #define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
624
+
625
+ #undef LOG_TEELN
626
+ #define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
627
+
628
+ #undef LOG_DISABLE
629
+ #define LOG_DISABLE() // dummy stub
630
+
631
+ #undef LOG_ENABLE
632
+ #define LOG_ENABLE() // dummy stub
633
+
634
+ #undef LOG_ENABLE
635
+ #define LOG_ENABLE() // dummy stub
636
+
637
+ #undef LOG_SET_TARGET
638
+ #define LOG_SET_TARGET(...) // dummy stub
639
+
640
+ #undef LOG_DUMP_CMDLINE
641
+ #define LOG_DUMP_CMDLINE(...) // dummy stub
642
+
643
+ #endif // LOG_DISABLE_LOGS
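
For reference, a short usage sketch of the macros above (not included in this commit; the include path and messages are illustrative). LOG() writes to the current log target, LOG_TEE() additionally mirrors the message to stderr, and the target can be switched or disabled at runtime:

    // #define LOG_TARGET stderr          // optionally redirect before including, as documented above
    #include "common/log.h"

    int main(int argc, char ** argv) {
        log_dump_cmdline(argc, argv);                               // record how the program was started
        LOG("loading %d layers\n", 32);                             // goes to the logfile only
        LOG_TEE("log target is %s\n", LOG_STRINGIZE(LOG_TARGET));   // logfile + stderr
        log_set_target(log_filename_generator("example", "log"));   // switch to example.<thread-id>.log
        LOGLN("switched log target");                               // '\n' appended automatically
        log_disable();                                              // LOG()/LOG_TEE() become no-ops
        return 0;
    }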
convert-baichuan-hf-to-gguf.py ADDED
@@ -0,0 +1,304 @@
1
+ #!/usr/bin/env python3
2
+ # HF baichuan --> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import struct
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import TYPE_CHECKING, Any
13
+ import itertools
14
+ import gguf
15
+ import numpy as np
16
+ import torch
17
+ from sentencepiece import SentencePieceProcessor # type: ignore[import]
18
+
19
+
20
+ if TYPE_CHECKING:
21
+ from typing import TypeAlias
22
+
23
+ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
24
+
25
+ # reverse HF permute back to original pth layout
26
+
27
+
28
+ def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
29
+ if n_kv_head is not None and n_head != n_kv_head:
30
+ n_head //= n_kv_head
31
+
32
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
33
+ .swapaxes(1, 2)
34
+ .reshape(weights.shape))
35
+
36
+ def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
37
+ r = weights.shape[0] // 3
38
+ return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
39
+
40
+ def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
41
+ r = weights.shape[0] // 3
42
+ return weights[r * n_part : r * n_part + r, ...]
43
+
44
+ def count_model_parts(dir_model: str) -> int:
45
+ num_parts = 0
46
+
47
+ for filename in os.listdir(dir_model):
48
+ if filename.startswith("pytorch_model-"):
49
+ num_parts += 1
50
+
51
+ if num_parts > 0:
52
+ print("gguf: found " + str(num_parts) + " model parts")
53
+
54
+ return num_parts
55
+
56
+
57
+
58
+ def parse_args() -> argparse.Namespace:
59
+ parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
60
+ parser.add_argument(
61
+ "--vocab-only", action="store_true",
62
+ help="extract only the vocab",
63
+ )
64
+ parser.add_argument(
65
+ "--outfile", type=Path,
66
+ help="path to write to; default: based on input",
67
+ )
68
+ parser.add_argument(
69
+ "model", type=Path,
70
+ help="directory containing model file, or model file itself (*.bin)",
71
+ )
72
+ parser.add_argument(
73
+ "ftype", type=int, choices=[0, 1], default=1, nargs='?',
74
+ help="output format - use 0 for float32, 1 for float16",
75
+ )
76
+ return parser.parse_args()
77
+
78
+ args = parse_args()
79
+
80
+ dir_model = args.model
81
+ ftype = args.ftype
82
+ if not dir_model.is_dir():
83
+ print(f'Error: {args.model} is not a directory', file = sys.stderr)
84
+ sys.exit(1)
85
+
86
+ # possible tensor data types
87
+ # ftype == 0 -> float32
88
+ # ftype == 1 -> float16
89
+
90
+ # map from ftype to string
91
+ ftype_str = ["f32", "f16"]
92
+
93
+ if args.outfile is not None:
94
+ fname_out = args.outfile
95
+ else:
96
+ # output in the same directory as the model by default
97
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
98
+
99
+ print("gguf: loading model "+dir_model.name)
100
+
101
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
102
+ hparams = json.load(f)
103
+ print("hello print: ",hparams["architectures"][0])
104
+ if hparams["architectures"][0] != "BaichuanForCausalLM":
105
+ print("Model architecture not supported: " + hparams["architectures"][0])
106
+
107
+ sys.exit()
108
+
109
+ # get number of model parts
110
+ num_parts = count_model_parts(dir_model)
111
+ print(f"num_parts:{num_parts}\n")
112
+ ARCH=gguf.MODEL_ARCH.BAICHUAN
113
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
114
+
115
+ print("gguf: get model metadata")
116
+
117
+ block_count = hparams["num_hidden_layers"]
118
+ head_count = hparams["num_attention_heads"]
119
+
120
+ if "num_key_value_heads" in hparams:
121
+ head_count_kv = hparams["num_key_value_heads"]
122
+ else:
123
+ head_count_kv = head_count
124
+
125
+ if "_name_or_path" in hparams:
126
+ hf_repo = hparams["_name_or_path"]
127
+ else:
128
+ hf_repo = ""
129
+
130
+ if "max_sequence_length" in hparams:
131
+ ctx_length = hparams["max_sequence_length"]
132
+ elif "max_position_embeddings" in hparams:
133
+ ctx_length = hparams["max_position_embeddings"]
134
+ elif "model_max_length" in hparams:
135
+ ctx_length = hparams["model_max_length"]
136
+ else:
137
+ print("gguf: can not find ctx length parameter.")
138
+
139
+ sys.exit()
140
+
141
+
142
+ gguf_writer.add_name(dir_model.name)
143
+ gguf_writer.add_source_hf_repo(hf_repo)
144
+ gguf_writer.add_tensor_data_layout("Meta AI original pth")
145
+ gguf_writer.add_context_length(ctx_length)
146
+ gguf_writer.add_embedding_length(hparams["hidden_size"])
147
+ gguf_writer.add_block_count(block_count)
148
+ gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
149
+ gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
150
+ gguf_writer.add_head_count(head_count)
151
+ gguf_writer.add_head_count_kv(head_count_kv)
152
+ gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
153
+
154
+ if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
155
+ if "type" in hparams["rope_scaling"]:
156
+ if hparams["rope_scaling"]["type"] == "linear":
157
+ gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
158
+
159
+
160
+ # TOKENIZATION
161
+
162
+ print("gguf: get tokenizer metadata")
163
+
164
+ tokens: list[bytes] = []
165
+ scores: list[float] = []
166
+ toktypes: list[int] = []
167
+
168
+ tokenizer_model_file = dir_model / 'tokenizer.model'
169
+ if not tokenizer_model_file.is_file():
170
+ print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
171
+ sys.exit(1)
172
+
173
+ # vocab type sentencepiece
174
+ print("gguf: get sentencepiece tokenizer vocab, scores and token types")
175
+
176
+ tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
177
+
178
+ for i in range(tokenizer.vocab_size()):
179
+ text: bytes
180
+ score: float
181
+
182
+ piece = tokenizer.id_to_piece(i)
183
+ text = piece.encode("utf-8")
184
+ score = tokenizer.get_score(i)
185
+
186
+ toktype = 1 # default to normal token type
187
+ if tokenizer.is_unknown(i):
188
+ toktype = 2
189
+ if tokenizer.is_control(i):
190
+ toktype = 3
191
+
192
+ # toktype = 4 is user-defined = tokens from added_tokens.json
193
+
194
+ if tokenizer.is_unused(i):
195
+ toktype = 5
196
+ if tokenizer.is_byte(i):
197
+ toktype = 6
198
+
199
+ tokens.append(text)
200
+ scores.append(score)
201
+ toktypes.append(toktype)
202
+
203
+ added_tokens_file = dir_model / 'added_tokens.json'
204
+ if added_tokens_file.is_file():
205
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
206
+ addtokens_json = json.load(f)
207
+
208
+ print("gguf: get added tokens")
209
+
210
+ for key in addtokens_json:
211
+ tokens.append( key.encode("utf-8") )
212
+ scores.append(-1000.0)
213
+ toktypes.append(4) # user-defined token type
214
+
215
+
216
+ gguf_writer.add_tokenizer_model("llama")
217
+ gguf_writer.add_token_list(tokens)
218
+ gguf_writer.add_token_scores(scores)
219
+ gguf_writer.add_token_types(toktypes)
220
+
221
+ special_vocab = gguf.SpecialVocab(dir_model)
222
+ special_vocab.add_to_gguf(gguf_writer)
223
+
224
+ # TENSORS
225
+
226
+ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
227
+
228
+ # tensor info
229
+ print("gguf: get tensor metadata")
230
+
231
+ if num_parts == 0:
232
+ part_names = iter(("pytorch_model.bin",))
233
+ else:
234
+ part_names = (
235
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
236
+ )
237
+
238
+
239
+ for part_name in part_names:
240
+ if args.vocab_only:
241
+ break
242
+ print("gguf: loading model part '" + part_name + "'")
243
+ model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
244
+
245
+ tmp=model_part
246
+ for i in range(block_count):
247
+ if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
248
+ print(f"Unpacking and permuting layer {i}")
249
+ tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
250
+ tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
251
+ tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
252
+ del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
253
+
254
+ for name in model_part.keys():
255
+ data = model_part[name]
256
+ # we don't need these
257
+ if name.endswith(".rotary_emb.inv_freq"):
258
+ continue
259
+
260
+ old_dtype = data.dtype
261
+
262
+ # convert any unsupported data types to float32
263
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
264
+ data = data.to(torch.float32)
265
+
266
+ data = data.squeeze().numpy()
267
+
268
+ # map tensor names
269
+ new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
270
+ if new_name is None:
271
+ print("Can not map tensor '" + name + "'")
272
+ sys.exit()
273
+
274
+ n_dims = len(data.shape)
275
+ data_dtype = data.dtype
276
+
277
+ # if f32 desired, convert any float16 to float32
278
+ if ftype == 0 and data_dtype == np.float16:
279
+ data = data.astype(np.float32)
280
+
281
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
282
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
283
+ data = data.astype(np.float32)
284
+
285
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
286
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
287
+ data = data.astype(np.float16)
288
+
289
+ print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
290
+ gguf_writer.add_tensor(new_name, data)
291
+
292
+
293
+ print("gguf: write header")
294
+ gguf_writer.write_header_to_file()
295
+ print("gguf: write metadata")
296
+ gguf_writer.write_kv_data_to_file()
297
+ if not args.vocab_only:
298
+ print("gguf: write tensors")
299
+ gguf_writer.write_tensors_to_file()
300
+
301
+ gguf_writer.close()
302
+
303
+ print(f"gguf: model successfully exported to '{fname_out}'")
304
+ print("")
convert-falcon-hf-to-gguf.py ADDED
@@ -0,0 +1,281 @@
1
+ #!/usr/bin/env python3
2
+ # HF falcon--> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import struct
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import numpy as np
15
+ import torch
16
+ from transformers import AutoTokenizer # type: ignore[import]
17
+
18
+ if 'NO_LOCAL_GGUF' not in os.environ:
19
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
20
+ import gguf
21
+
22
+
23
+ def bytes_to_unicode():
24
+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
25
+ """
26
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
27
+ The reversible bpe codes work on unicode strings.
28
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
29
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
30
+ This is a significant percentage of your normal, say, 32K bpe vocab.
31
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
32
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
33
+ """
34
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
35
+ cs = bs[:]
36
+ n = 0
37
+ for b in range(2**8):
38
+ if b not in bs:
39
+ bs.append(b)
40
+ cs.append(2**8+n)
41
+ n += 1
42
+ return dict(zip(bs, (chr(n) for n in cs)))
43
+
44
+
45
+ def count_model_parts(dir_model: Path) -> int:
46
+ num_parts = 0
47
+ for filename in os.listdir(dir_model):
48
+ if filename.startswith("pytorch_model-"):
49
+ num_parts += 1
50
+
51
+ if num_parts > 0:
52
+ print("gguf: found " + str(num_parts) + " model parts")
53
+ return num_parts
54
+
55
+
56
+ def parse_args() -> argparse.Namespace:
57
+ parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
58
+ parser.add_argument(
59
+ "--vocab-only", action="store_true",
60
+ help="extract only the vocab",
61
+ )
62
+ parser.add_argument(
63
+ "--outfile", type=Path,
64
+ help="path to write to; default: based on input",
65
+ )
66
+ parser.add_argument(
67
+ "model", type=Path,
68
+ help="directory containing model file, or model file itself (*.bin)",
69
+ )
70
+ parser.add_argument(
71
+ "ftype", type=int, choices=[0, 1], default=1, nargs='?',
72
+ help="output format - use 0 for float32, 1 for float16",
73
+ )
74
+ return parser.parse_args()
75
+
76
+ args = parse_args()
77
+
78
+ dir_model = args.model
79
+ ftype = args.ftype
80
+ if not dir_model.is_dir():
81
+ print(f'Error: {args.model} is not a directory', file = sys.stderr)
82
+ sys.exit(1)
83
+
84
+ # possible tensor data types
85
+ # ftype == 0 -> float32
86
+ # ftype == 1 -> float16
87
+
88
+ # map from ftype to string
89
+ ftype_str = ["f32", "f16"]
90
+
91
+ if args.outfile is not None:
92
+ fname_out = args.outfile
93
+ else:
94
+ # output in the same directory as the model by default
95
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
96
+
97
+ print("gguf: loading model "+dir_model.name)
98
+
99
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
100
+ hparams = json.load(f)
101
+
102
+ if hparams["architectures"][0] != "RWForCausalLM":
103
+ print("Model architecture not supported: " + hparams["architectures"][0])
104
+
105
+ sys.exit(1)
106
+
107
+ # get number of model parts
108
+ num_parts = count_model_parts(dir_model)
109
+
110
+ ARCH=gguf.MODEL_ARCH.FALCON
111
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
112
+
113
+ print("gguf: get model metadata")
114
+
115
+ block_count = hparams["n_layer"]
116
+
117
+ gguf_writer.add_name("Falcon")
118
+ gguf_writer.add_context_length(2048) # not in config.json
119
+ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
120
+ gguf_writer.add_embedding_length(hparams["hidden_size"])
121
+ gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
122
+ gguf_writer.add_block_count(block_count)
123
+ gguf_writer.add_head_count(hparams["n_head"])
124
+ if "n_head_kv" in hparams:
125
+ gguf_writer.add_head_count_kv(hparams["n_head_kv"])
126
+ else:
127
+ gguf_writer.add_head_count_kv(1)
128
+ gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
129
+ gguf_writer.add_file_type(ftype)
130
+
131
+ # TOKENIZATION
132
+
133
+ print("gguf: get tokenizer metadata")
134
+
135
+ tokens: list[bytearray] = []
136
+ scores: list[float] = []
137
+ toktypes: list[int] = []
138
+
139
+ tokenizer_json_file = dir_model / 'tokenizer.json'
140
+ if not tokenizer_json_file.is_file():
141
+ print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
142
+ sys.exit(1)
143
+
144
+ # gpt2 tokenizer
145
+ gguf_writer.add_tokenizer_model("gpt2")
146
+
147
+ with open(tokenizer_json_file, "r", encoding="utf-8") as f:
148
+ tokenizer_json = json.load(f)
149
+
150
+ print("gguf: get gpt2 tokenizer vocab")
151
+
152
+ # The number of tokens in tokenizer.json can differ from the expected vocab size.
153
+ # This causes downstream issues with mismatched tensor sizes when running inference
154
+ vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
155
+
156
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
157
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
158
+
159
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
160
+ byte_encoder = bytes_to_unicode()
161
+ byte_decoder = {v: k for k, v in byte_encoder.items()}
162
+
163
+ for i in range(vocab_size):
164
+ if i in reverse_vocab:
165
+ try:
166
+ text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
167
+ except KeyError:
168
+ text = bytearray()
169
+ for c in reverse_vocab[i]:
170
+ if ord(c) < 256: # single byte character
171
+ text.append(byte_decoder[ord(c)])
172
+ else: # multibyte special token character
173
+ text.extend(c.encode('utf-8'))
174
+ else:
175
+ print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
176
+ pad_token = f"[PAD{i}]".encode("utf8")
177
+ text = bytearray(pad_token)
178
+
179
+ tokens.append(text)
180
+ scores.append(0.0) # dummy
181
+ toktypes.append(gguf.TokenType.NORMAL) # dummy
182
+
183
+ gguf_writer.add_token_list(tokens)
184
+ gguf_writer.add_token_scores(scores)
185
+ gguf_writer.add_token_types(toktypes)
186
+
187
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
188
+ special_vocab.add_to_gguf(gguf_writer)
189
+
190
+ # TENSORS
191
+
192
+ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
193
+
194
+ # params for qkv transform
195
+ n_head = hparams["n_head"]
196
+ n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
197
+
198
+ head_dim = hparams["hidden_size"] // n_head
199
+
200
+ # tensor info
201
+ print("gguf: get tensor metadata")
202
+
203
+ if num_parts == 0:
204
+ part_names = iter(("pytorch_model.bin",))
205
+ else:
206
+ part_names = (
207
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
208
+ )
209
+
210
+ for part_name in part_names:
211
+ if args.vocab_only:
212
+ break
213
+ print("gguf: loading model part '" + part_name + "'")
214
+ model_part = torch.load(dir_model / part_name, map_location="cpu")
215
+
216
+ for name in model_part.keys():
217
+ data = model_part[name]
218
+
219
+ old_dtype = data.dtype
220
+
221
+ # convert any unsupported data types to float32
222
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
223
+ data = data.to(torch.float32)
224
+
225
+ # QKV tensor transform
226
+ # The original query_key_value tensor contains n_head_kv "kv groups",
227
+ # each consisting of n_head/n_head_kv query weights followed by one key
228
+ # and one value weight (shared by all query heads in the kv group).
229
+ # This layout makes it a big pain to work with in GGML.
230
+ # So we rearrange them here, so that we have n_head query weights
231
+ # followed by n_head_kv key weights followed by n_head_kv value weights,
232
+ # in contiguous fashion.
233
+ # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
234
+
235
+ if "query_key_value" in name:
236
+ qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
237
+ q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
238
+ k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
239
+ v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
240
+ data = torch.cat((q,k,v)).reshape_as(data)
241
+
242
+ data = data.squeeze().numpy()
243
+
244
+ # map tensor names
245
+ new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
246
+ if new_name is None:
247
+ print("Cannot map tensor '" + name + "'")
248
+ sys.exit(1)
249
+
250
+ n_dims = len(data.shape)
251
+ data_dtype = data.dtype
252
+
253
+ # if f32 desired, convert any float16 to float32
254
+ if ftype == 0 and data_dtype == np.float16:
255
+ data = data.astype(np.float32)
256
+
257
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
258
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
259
+ data = data.astype(np.float32)
260
+
261
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
262
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
263
+ data = data.astype(np.float16)
264
+
265
+ print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
266
+
267
+ gguf_writer.add_tensor(new_name, data)
268
+
269
+
270
+ print("gguf: write header")
271
+ gguf_writer.write_header_to_file()
272
+ print("gguf: write metadata")
273
+ gguf_writer.write_kv_data_to_file()
274
+ if not args.vocab_only:
275
+ print("gguf: write tensors")
276
+ gguf_writer.write_tensors_to_file()
277
+
278
+ gguf_writer.close()
279
+
280
+ print(f"gguf: model successfully exported to '{fname_out}'")
281
+ print("")
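The query_key_value regrouping described in the comment above is easier to follow on a toy tensor. A minimal sketch, not part of the commit, using made-up head counts (n_head=4, n_head_kv=2, head_dim=2) instead of the values the script reads from config.json:

import torch

n_head, n_head_kv, head_dim = 4, 2, 2                 # made-up values for illustration
n_embd = n_head * head_dim                            # 8
fused_rows = (n_head + 2 * n_head_kv) * head_dim      # 16 rows: per kv group, q..q then k then v (HF layout)

data = torch.arange(fused_rows * n_embd, dtype=torch.float32).reshape(fused_rows, n_embd)

qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)     # all query heads first
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)  # then the shared key weights
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)  # then the shared value weights
out = torch.cat((q, k, v)).reshape_as(data)
print(out.shape)  # torch.Size([16, 8]); same shape, rows regrouped contiguously as q..q k..k v..v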
convert-gptneox-hf-to-gguf.py ADDED
@@ -0,0 +1,251 @@
1
+ #!/usr/bin/env python3
2
+ # HF gptneox--> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import struct
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import numpy as np
15
+ import torch
16
+ from transformers import AutoTokenizer # type: ignore[import]
17
+
18
+ if 'NO_LOCAL_GGUF' not in os.environ:
19
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
20
+ import gguf
21
+
22
+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
23
+
24
+
25
+ def bytes_to_unicode():
26
+ """
27
+ Returns a list of utf-8 bytes and a corresponding list of unicode strings.
28
+ The reversible bpe codes work on unicode strings.
29
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
30
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
31
+ This is a significant percentage of your normal, say, 32K bpe vocab.
32
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
33
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
34
+ """
35
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
36
+ cs = bs[:]
37
+ n = 0
38
+ for b in range(2**8):
39
+ if b not in bs:
40
+ bs.append(b)
41
+ cs.append(2**8+n)
42
+ n += 1
43
+ return dict(zip(bs, (chr(n) for n in cs)))
44
+
45
+
46
+ def count_model_parts(dir_model: Path) -> int:
47
+ num_parts = 0
48
+ for filename in os.listdir(dir_model):
49
+ if filename.startswith("pytorch_model-"):
50
+ num_parts += 1
51
+
52
+ if num_parts > 0:
53
+ print("gguf: found " + str(num_parts) + " model parts")
54
+ return num_parts
55
+
56
+
57
+ def parse_args() -> argparse.Namespace:
58
+ parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
59
+ parser.add_argument(
60
+ "--vocab-only", action="store_true",
61
+ help="extract only the vocab",
62
+ )
63
+ parser.add_argument(
64
+ "--outfile", type=Path,
65
+ help="path to write to; default: based on input",
66
+ )
67
+ parser.add_argument(
68
+ "model", type=Path,
69
+ help="directory containing model file, or model file itself (*.bin)",
70
+ )
71
+ parser.add_argument(
72
+ "ftype", type=int, choices=[0, 1], default=1, nargs='?',
73
+ help="output format - use 0 for float32, 1 for float16",
74
+ )
75
+ return parser.parse_args()
76
+
77
+ args = parse_args()
78
+
79
+ dir_model = args.model
80
+ ftype = args.ftype
81
+ if not dir_model.is_dir():
82
+ print(f'Error: {args.model} is not a directory', file = sys.stderr)
83
+ sys.exit(1)
84
+
85
+ # possible tensor data types
86
+ # ftype == 0 -> float32
87
+ # ftype == 1 -> float16
88
+
89
+ # map from ftype to string
90
+ ftype_str = ["f32", "f16"]
91
+
92
+ if args.outfile is not None:
93
+ fname_out = args.outfile
94
+ else:
95
+ # output in the same directory as the model by default
96
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
97
+
98
+ print("gguf: loading model "+dir_model.name)
99
+
100
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
101
+ hparams = json.load(f)
102
+
103
+ if hparams["architectures"][0] != "GPTNeoXForCausalLM":
104
+ print("Model architecture not supported: " + hparams["architectures"][0])
105
+
106
+ sys.exit(1)
107
+
108
+ # get number of model parts
109
+ num_parts = count_model_parts(dir_model)
110
+
111
+ ARCH=gguf.MODEL_ARCH.GPTNEOX
112
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
113
+
114
+ print("gguf: get model metadata")
115
+
116
+ block_count = hparams["num_hidden_layers"]
117
+
118
+ gguf_writer.add_name(dir_model.name)
119
+ gguf_writer.add_context_length(hparams["max_position_embeddings"])
120
+ gguf_writer.add_embedding_length(hparams["hidden_size"])
121
+ gguf_writer.add_block_count(block_count)
122
+ gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
123
+ gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
124
+ gguf_writer.add_head_count(hparams["num_attention_heads"])
125
+ gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
126
+ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
127
+
128
+ # TOKENIZATION
129
+
130
+ print("gguf: get tokenizer metadata")
131
+
132
+ tokens: list[bytearray] = []
133
+
134
+ tokenizer_json_file = dir_model / 'tokenizer.json'
135
+ if not tokenizer_json_file.is_file():
136
+ print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
137
+ sys.exit(1)
138
+
139
+ # gpt2 tokenizer
140
+ gguf_writer.add_tokenizer_model("gpt2")
141
+
142
+ with open(tokenizer_json_file, "r", encoding="utf-8") as f:
143
+ tokenizer_json = json.load(f)
144
+
145
+ print("gguf: get gpt2 tokenizer vocab")
146
+
147
+ vocab_size = len(tokenizer_json["model"]["vocab"])
148
+
149
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
150
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
151
+
152
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
153
+ byte_encoder = bytes_to_unicode()
154
+ byte_decoder = {v: k for k, v in byte_encoder.items()}
155
+
156
+ for i in range(vocab_size):
157
+ if i in reverse_vocab:
158
+ try:
159
+ text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
160
+ except KeyError:
161
+ text = bytearray()
162
+ for c in reverse_vocab[i]:
163
+ if ord(c) < 256: # single byte character
164
+ text.append(byte_decoder[ord(c)])
165
+ else: # multibyte special token character
166
+ text.extend(c.encode('utf-8'))
167
+ else:
168
+ print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
169
+ pad_token = f"[PAD{i}]".encode("utf8")
170
+ text = bytearray(pad_token)
171
+
172
+ tokens.append(text)
173
+
174
+ gguf_writer.add_token_list(tokens)
175
+
176
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
177
+ special_vocab.add_to_gguf(gguf_writer)
178
+
179
+ # TENSORS
180
+
181
+ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
182
+
183
+ # tensor info
184
+ print("gguf: get tensor metadata")
185
+
186
+ if num_parts == 0:
187
+ part_names = iter(("pytorch_model.bin",))
188
+ else:
189
+ part_names = (
190
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
191
+ )
192
+
193
+ for part_name in part_names:
194
+ if args.vocab_only:
195
+ break
196
+ print("gguf: loading model part '" + part_name + "'")
197
+ model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
198
+
199
+ for name in model_part.keys():
200
+ data = model_part[name]
201
+
202
+ # we don't need these
203
+ if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
204
+ continue
205
+
206
+ old_dtype = data.dtype
207
+
208
+ # convert any unsupported data types to float32
209
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
210
+ data = data.to(torch.float32)
211
+
212
+ data = data.squeeze().numpy()
213
+
214
+ # map tensor names
215
+ new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
216
+ if new_name is None:
217
+ print("Cannot map tensor '" + name + "'")
218
+ sys.exit(1)
219
+
220
+ n_dims = len(data.shape)
221
+ data_dtype = data.dtype
222
+
223
+ # if f32 desired, convert any float16 to float32
224
+ if ftype == 0 and data_dtype == np.float16:
225
+ data = data.astype(np.float32)
226
+
227
+ # TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
228
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
229
+ data = data.astype(np.float32)
230
+
231
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
232
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
233
+ data = data.astype(np.float16)
234
+
235
+ print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
236
+
237
+ gguf_writer.add_tensor(new_name, data)
238
+
239
+
240
+ print("gguf: write header")
241
+ gguf_writer.write_header_to_file()
242
+ print("gguf: write metadata")
243
+ gguf_writer.write_kv_data_to_file()
244
+ if not args.vocab_only:
245
+ print("gguf: write tensors")
246
+ gguf_writer.write_tensors_to_file()
247
+
248
+ gguf_writer.close()
249
+
250
+ print(f"gguf: model successfully exported to '{fname_out}'")
251
+ print("")
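The gpt2-style vocab handling in this script (and in the Falcon and StarCoder converters) relies on mapping every byte to a printable unicode character and back. A small round-trip sketch of that decoding, not part of the commit; it uses the same table shipped with transformers, and the token string is made up:

from transformers.models.gpt2 import tokenization_gpt2  # same byte<->unicode table the converters rebuild

byte_encoder = tokenization_gpt2.bytes_to_unicode()      # one printable unicode char per byte value
byte_decoder = {v: k for k, v in byte_encoder.items()}

token = "Ġhello"                                         # hypothetical vocab entry; "Ġ" encodes the space byte 0x20
raw = bytearray(byte_decoder[c] for c in token)
print(raw)                                               # bytearray(b' hello')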
convert-llama-ggml-to-gguf.py ADDED
@@ -0,0 +1,451 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import math
6
+ import struct
7
+ import sys
8
+ from enum import IntEnum
9
+ from pathlib import Path
10
+
11
+ import numpy as np
12
+
13
+ import os
14
+ if 'NO_LOCAL_GGUF' not in os.environ:
15
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
16
+ import gguf
17
+
18
+ # Note: Does not support GGML_QKK_64
19
+ QK_K = 256
20
+ # Items here are (block size, type size)
21
+ GGML_QUANT_SIZES = {
22
+ gguf.GGMLQuantizationType.F32 : (1, 4),
23
+ gguf.GGMLQuantizationType.F16 : (1, 2),
24
+ gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
25
+ gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
26
+ gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
27
+ gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
28
+ gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
29
+ gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
30
+ gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
31
+ gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
32
+ gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
33
+ gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
34
+ gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
35
+ gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
36
+ }
37
+
38
+ class GGMLFormat(IntEnum):
39
+ GGML = 0
40
+ GGMF = 1
41
+ GGJT = 2
42
+
43
+ class GGMLFType(IntEnum):
44
+ ALL_F32 = 0
45
+ MOSTLY_F16 = 1
46
+ MOSTLY_Q4_0 = 2
47
+ MOSTLY_Q4_1 = 3
48
+ MOSTLY_Q4_1_SOME_F16 = 4
49
+ MOSTLY_Q8_0 = 7
50
+ MOSTLY_Q5_0 = 8
51
+ MOSTLY_Q5_1 = 9
52
+ MOSTLY_Q2_K = 10
53
+ MOSTLY_Q3_K_S = 11
54
+ MOSTLY_Q3_K_M = 12
55
+ MOSTLY_Q3_K_L = 13
56
+ MOSTLY_Q4_K_S = 14
57
+ MOSTLY_Q4_K_M = 15
58
+ MOSTLY_Q5_K_S = 16
59
+ MOSTLY_Q5_K_M = 17
60
+ MOSTLY_Q6_K = 18
61
+
62
+ class Hyperparameters:
63
+ def __init__(self):
64
+ self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
65
+ self.n_layer = self.n_rot = self.n_ff = 0
66
+ self.ftype = GGMLFType.ALL_F32
67
+
68
+ def set_n_ff(self, model):
69
+ ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
70
+ assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
71
+ ff_tensor = model.tensors[ff_tensor_idx]
72
+ self.n_ff = ff_tensor.dims[1]
73
+
74
+ def load(self, data, offset):
75
+ (
76
+ self.n_vocab,
77
+ self.n_embd,
78
+ self.n_mult,
79
+ self.n_head,
80
+ self.n_layer,
81
+ self.n_rot,
82
+ ftype,
83
+ ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
84
+ try:
85
+ self.ftype = GGMLFType(ftype)
86
+ except ValueError:
87
+ raise ValueError(f'Invalid ftype {ftype}')
88
+ return 4 * 7
89
+
90
+ def __str__(self):
91
+ return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
92
+
93
+ class Vocab:
94
+ def __init__(self, load_scores = True):
95
+ self.items = []
96
+ self.load_scores = load_scores
97
+
98
+ def load(self, data, offset, n_vocab):
99
+ orig_offset = offset
100
+ for _ in range(n_vocab):
101
+ itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
102
+ assert itemlen < 4096, 'Absurd vocab item length'
103
+ offset += 4
104
+ item_text = bytes(data[offset:offset + itemlen])
105
+ offset += itemlen
106
+ if self.load_scores:
107
+ item_score = struct.unpack('<f', data[offset:offset + 4])[0]
108
+ offset += 4
109
+ else:
110
+ item_score = 0.0
111
+ self.items.append((item_text, item_score))
112
+ return offset - orig_offset
113
+
114
+ class Tensor:
115
+ def __init__(self, use_padding = True):
116
+ self.name = None
117
+ self.dims: tuple[int, ...] = ()
118
+ self.dtype = None
119
+ self.start_offset = 0
120
+ self.len_bytes = np.int64(0)
121
+ self.use_padding = use_padding
122
+
123
+ def load(self, data, offset):
124
+ orig_offset = offset
125
+ (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
126
+ assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
127
+ assert name_len < 4096, 'Absurd tensor name length'
128
+ quant = GGML_QUANT_SIZES.get(dtype)
129
+ assert quant is not None, 'Unknown tensor type'
130
+ (blksize, tysize) = quant
131
+ offset += 12
132
+ self.dtype = dtype
133
+ self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
134
+ offset += 4 * n_dims
135
+ self.name = bytes(data[offset:offset + name_len])
136
+ offset += name_len
137
+ pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
138
+ offset += pad
139
+ n_elems = np.prod(self.dims)
140
+ n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
141
+ self.start_offset = offset
142
+ self.len_bytes = n_bytes
143
+ offset += n_bytes
144
+ # print(n_dims, name_len, dtype, self.dims, self.name, pad)
145
+ return offset - orig_offset
146
+
147
+ class GGMLModel:
148
+ def __init__(self):
149
+ self.hyperparameters = None
150
+ self.vocab = None
151
+ self.tensor_map = {}
152
+ self.tensors = []
153
+
154
+ def validate_header(self, data, offset):
155
+ magic = bytes(data[offset:offset + 4])
156
+ if magic == b'GGUF':
157
+ raise ValueError('File is already in GGUF format.')
158
+ if magic == b'lmgg':
159
+ self.file_format = GGMLFormat.GGML
160
+ self.format_version = 1
161
+ return 4
162
+ version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
163
+ if magic == b'fmgg':
164
+ if version != 1:
165
+ raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
166
+ self.file_format = GGMLFormat.GGMF
167
+ self.format_version = version
168
+ return 8
169
+ if magic == b'tjgg':
170
+ if version < 1 or version > 3:
171
+ raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
172
+ self.file_format = GGMLFormat.GGJT
173
+ self.format_version = version
174
+ return 8
175
+ raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
176
+
177
+ def validate_conversion(self, ftype):
178
+ err = ''
179
+ if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
180
+ if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
181
+ err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
182
+ elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
183
+ if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
184
+ GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
185
+ err = 'Q4 and Q8 quantizations changed in GGJTv3.'
186
+ if len(err) > 0:
187
+ raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
188
+
189
+ def load(self, data, offset):
190
+ offset += self.validate_header(data, offset)
191
+ hp = Hyperparameters()
192
+ offset += hp.load(data, offset)
193
+ print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
194
+ self.validate_conversion(hp.ftype)
195
+ vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
196
+ offset += vocab.load(data, offset, hp.n_vocab)
197
+ tensors: list[Tensor] = []
198
+ tensor_map = {}
199
+ while offset < len(data):
200
+ tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
201
+ offset += tensor.load(data, offset)
202
+ tensor_map[tensor.name] = len(tensors)
203
+ tensors.append(tensor)
204
+ self.hyperparameters = hp
205
+ self.vocab = vocab
206
+ self.tensors = tensors
207
+ self.tensor_map = tensor_map
208
+ hp.set_n_ff(self)
209
+ return offset
210
+
211
+ class GGMLToGGUF:
212
+ def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
213
+ hp = ggml_model.hyperparameters
214
+ self.model = ggml_model
215
+ self.data = data
216
+ self.cfg = cfg
217
+ self.params_override = params_override
218
+ self.vocab_override = vocab_override
219
+ self.special_vocab = special_vocab
220
+ if params_override is not None:
221
+ n_kv_head = params_override.n_head_kv
222
+ else:
223
+ if cfg.gqa == 1:
224
+ n_kv_head = hp.n_head
225
+ else:
226
+ gqa = float(cfg.gqa)
227
+ n_kv_head = None
228
+ for x in range(1, 256):
229
+ if float(hp.n_head) / float(x) == gqa:
230
+ n_kv_head = x
231
+ assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
232
+ print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
233
+ self.n_kv_head = n_kv_head
234
+ self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
235
+
236
+ def save(self):
237
+ print('* Preparing to save GGUF file')
238
+ gguf_writer = gguf.GGUFWriter(
239
+ self.cfg.output,
240
+ gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
241
+ use_temp_file = False )
242
+ self.add_params(gguf_writer)
243
+ self.add_vocab(gguf_writer)
244
+ if self.special_vocab is not None:
245
+ self.special_vocab.add_to_gguf(gguf_writer)
246
+ self.add_tensors(gguf_writer)
247
+ print(" gguf: write header")
248
+ gguf_writer.write_header_to_file()
249
+ print(" gguf: write metadata")
250
+ gguf_writer.write_kv_data_to_file()
251
+ print(" gguf: write tensors")
252
+ gguf_writer.write_tensors_to_file()
253
+ gguf_writer.close()
254
+
255
+ def add_params(self, gguf_writer):
256
+ hp = self.model.hyperparameters
257
+ cfg = self.cfg
258
+ if cfg.desc is not None:
259
+ desc = cfg.desc
260
+ else:
261
+ desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
262
+ try:
263
+ # Filenames aren't necessarily valid UTF8.
264
+ name = cfg.name if cfg.name is not None else cfg.input.name
265
+ except UnicodeDecodeError:
266
+ name = None
267
+ print('* Adding model parameters and KV items')
268
+ if name is not None:
269
+ gguf_writer.add_name(name)
270
+ gguf_writer.add_description(desc)
271
+ gguf_writer.add_file_type(int(hp.ftype))
272
+ if self.params_override is not None:
273
+ po = self.params_override
274
+ assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
275
+ assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
276
+ assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
277
+ gguf_writer.add_context_length (po.n_ctx)
278
+ gguf_writer.add_embedding_length (po.n_embd)
279
+ gguf_writer.add_block_count (po.n_layer)
280
+ gguf_writer.add_feed_forward_length (po.n_ff)
281
+ gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
282
+ gguf_writer.add_head_count (po.n_head)
283
+ gguf_writer.add_head_count_kv (po.n_head_kv)
284
+ gguf_writer.add_layer_norm_rms_eps (po.f_norm_eps)
285
+ return
286
+ gguf_writer.add_context_length(cfg.context_length)
287
+ gguf_writer.add_embedding_length(hp.n_embd)
288
+ gguf_writer.add_block_count(hp.n_layer)
289
+ gguf_writer.add_feed_forward_length(hp.n_ff)
290
+ gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
291
+ gguf_writer.add_head_count(hp.n_head)
292
+ gguf_writer.add_head_count_kv(self.n_kv_head)
293
+ gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
294
+
295
+ def add_vocab(self, gguf_writer):
296
+ hp = self.model.hyperparameters
297
+ gguf_writer.add_tokenizer_model('llama')
298
+ tokens = []
299
+ scores = []
300
+ toktypes = []
301
+ if self.vocab_override is not None:
302
+ vo = self.vocab_override
303
+ print('* Adding vocab item(s)')
304
+ for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
305
+ tokens.append(vbytes)
306
+ scores.append(score)
307
+ toktypes.append(ttype)
308
+ assert len(tokens) == hp.n_vocab, \
309
+ f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
310
+ gguf_writer.add_token_list(tokens)
311
+ gguf_writer.add_token_scores(scores)
312
+ if len(toktypes) > 0:
313
+ gguf_writer.add_token_types(toktypes)
314
+ return
315
+ print(f'* Adding {hp.n_vocab} vocab item(s)')
316
+ assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
317
+ for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
318
+ tt = 1 # Normal
319
+ # Special handling for UNK, BOS, EOS tokens.
320
+ if tokid <= 2:
321
+ if tokid == 0:
322
+ vbytes = b'<unk>'
323
+ tt = 2
324
+ elif tokid == 1:
325
+ vbytes = b'<s>'
326
+ tt = 3
327
+ else:
328
+ vbytes = b'</s>'
329
+ tt = 3
330
+ elif len(vbytes) == 0:
331
+ tt = 3 # Control
332
+ elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
333
+ vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
334
+ tt = 6 # Byte
335
+ else:
336
+ vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
337
+ toktypes.append(tt)
338
+ tokens.append(vbytes)
339
+ scores.append(vscore)
340
+ gguf_writer.add_token_list(tokens)
341
+ gguf_writer.add_token_scores(scores)
342
+ gguf_writer.add_token_types(toktypes)
343
+ gguf_writer.add_unk_token_id(0)
344
+ gguf_writer.add_bos_token_id(1)
345
+ gguf_writer.add_eos_token_id(2)
346
+
347
+ def add_tensors(self, gguf_writer):
348
+ tensor_map = self.name_map
349
+ data = self.data
350
+ print(f'* Adding {len(self.model.tensors)} tensor(s)')
351
+ for tensor in self.model.tensors:
352
+ name = str(tensor.name, 'UTF-8')
353
+ mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
354
+ assert mapped_name is not None, f'Bad name {name}'
355
+ tempdims = list(tensor.dims[:])
356
+ if len(tempdims) > 1:
357
+ temp = tempdims[1]
358
+ tempdims[1] = tempdims[0]
359
+ tempdims[0] = temp
360
+ # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
361
+ gguf_writer.add_tensor(
362
+ mapped_name,
363
+ data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
364
+ raw_shape = tempdims,
365
+ raw_dtype = tensor.dtype )
366
+
367
+ def handle_metadata(cfg, hp):
368
+ import convert
369
+ assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
370
+ hf_config_path = cfg.model_metadata_dir / "config.json"
371
+ orig_config_path = cfg.model_metadata_dir / "params.json"
372
+ # We pass a fake model here. "original" mode will check the shapes of some
373
+ # tensors if information is missing in the .json file: other than that, the
374
+ # model data isn't used so this should be safe (at least for now).
375
+ fakemodel = {
376
+ 'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
377
+ 'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
378
+ }
379
+ fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
380
+ fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
381
+ if hf_config_path.exists():
382
+ params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
383
+ elif orig_config_path.exists():
384
+ params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
385
+ else:
386
+ raise ValueError('Unable to load metadata')
387
+ vocab = convert.load_vocab(
388
+ cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
389
+ cfg.vocabtype )
390
+ # FIXME: Respect cfg.vocab_dir?
391
+ svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
392
+ convert.check_vocab_size(params, vocab)
393
+ return (params, vocab, svocab)
394
+
395
+ def handle_args():
396
+ parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
397
+ parser.add_argument('--input', '-i', type = Path, required = True,
398
+ help = 'Input GGMLv3 filename')
399
+ parser.add_argument('--output', '-o', type = Path, required = True,
400
+ help ='Output GGUF filename')
401
+ parser.add_argument('--name',
402
+ help = 'Set model name')
403
+ parser.add_argument('--desc',
404
+ help = 'Set model description')
405
+ parser.add_argument('--gqa', type = int, default = 1,
406
+ help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
407
+ parser.add_argument('--eps', default = '5.0e-06',
408
+ help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
409
+ parser.add_argument('--context-length', '-c', type=int, default = 2048,
410
+ help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
411
+ parser.add_argument('--model-metadata-dir', '-m', type = Path,
412
+ help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
413
+ parser.add_argument("--vocab-dir", type=Path,
414
+ help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
415
+ parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
416
+ help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
417
+ return parser.parse_args()
418
+
419
+ def main():
420
+ cfg = handle_args()
421
+ print(f'* Using config: {cfg}')
422
+ print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
423
+ if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
424
+ print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
425
+ data = np.memmap(cfg.input, mode = 'r')
426
+ model = GGMLModel()
427
+ print('* Scanning GGML input file')
428
+ offset = model.load(data, 0)
429
+ print(f'* GGML model hyperparameters: {model.hyperparameters}')
430
+ vocab_override = None
431
+ params_override = None
432
+ special_vocab = None
433
+ if cfg.model_metadata_dir is not None:
434
+ (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
435
+ print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
436
+ print(f'* Overriding params: {params_override}')
437
+ print(f'* Overriding vocab: {vocab_override}')
438
+ print(f'* Special vocab: {special_vocab}')
439
+ else:
440
+ print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
441
+ if model.file_format == GGMLFormat.GGML:
442
+ print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
443
+ converter = GGMLToGGUF(model, data, cfg,
444
+ params_override = params_override,
445
+ vocab_override = vocab_override,
446
+ special_vocab = special_vocab )
447
+ converter.save()
448
+ print(f'* Successful completion. Output saved to: {cfg.output}')
449
+
450
+ if __name__ == '__main__':
451
+ main()
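The size arithmetic in Tensor.load() follows directly from the GGML_QUANT_SIZES table: each block of block_size weights occupies type_size bytes, and GGJT files pad each tensor's data to a 32-byte boundary. A back-of-the-envelope sketch, not part of the commit, with a hypothetical Q4_0 tensor and file offset:

block_size, type_size = 32, 2 + 16            # the Q4_0 row of GGML_QUANT_SIZES: 32 weights in 18 bytes
dims = (4096, 4096)                           # hypothetical 2-D weight tensor

n_elems = dims[0] * dims[1]
n_bytes = n_elems * type_size // block_size   # 16777216 * 18 // 32 = 9437184 bytes of tensor data
print(n_bytes)

offset = 1234                                 # hypothetical offset right after a tensor header
pad = ((offset + 31) & ~31) - offset          # round up to the next multiple of 32, so pad = 14 here
print(pad)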
convert-lora-to-ggml.py CHANGED
@@ -1,28 +1,29 @@
1
- #!/usr/bin/env python
 
 
2
  import json
3
  import os
4
  import re
5
  import struct
6
  import sys
7
- from typing import Any, Dict, Sequence, TextIO
8
 
 
9
  import torch
10
 
11
- from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType
 
12
 
13
  HF_SUBLAYER_TO_GGML = {
14
- "self_attn.q_proj": "attention.wq",
15
- "self_attn.k_proj": "attention.wk",
16
- "self_attn.v_proj": "attention.wv",
17
- "self_attn.o_proj": "attention.wo",
18
- "mlp.gate_proj": "feed_forward.w1",
19
- "mlp.down_proj": "feed_forward.w2",
20
- "mlp.up_proj": "feed_forward.w3",
21
- "input_layernorm": "attention_norm",
22
  "post_attention_layernorm": "ffn_norm",
23
- # "norm": "norm",
24
- # "embed_tokens": "tok_embeddings",
25
- # "lm_head": "output",
26
  }
27
 
28
 
@@ -39,7 +40,7 @@ def translate_tensor_name(t: str) -> str:
39
  sys.exit(1)
40
 
41
  output_string = (
42
- f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
43
  )
44
  return output_string
45
  else:
@@ -47,19 +48,21 @@ def translate_tensor_name(t: str) -> str:
47
  sys.exit(1)
48
 
49
 
50
- def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
51
  fout.write(b"ggla"[::-1]) # magic (ggml lora)
52
  fout.write(struct.pack("i", 1)) # file version
53
  fout.write(struct.pack("i", params["r"]))
54
  # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
55
  # but some models ship a float value instead
56
  # let's convert to int, but fail if lossless conversion is not possible
57
- assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly"
 
 
58
  fout.write(struct.pack("i", int(params["lora_alpha"])))
59
 
60
 
61
  def write_tensor_header(
62
- self, name: str, shape: Sequence[int], data_type: DataType
63
  ) -> None:
64
  sname = name.encode("utf-8")
65
  fout.write(
@@ -67,7 +70,7 @@ def write_tensor_header(
67
  "iii",
68
  len(shape),
69
  len(sname),
70
- DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
71
  )
72
  )
73
  fout.write(struct.pack("i" * len(shape), *shape[::-1]))
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
  import json
5
  import os
6
  import re
7
  import struct
8
  import sys
9
+ from typing import Any, BinaryIO, Sequence
10
 
11
+ import numpy as np
12
  import torch
13
 
14
+ NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
15
+
16
 
17
  HF_SUBLAYER_TO_GGML = {
18
+ "self_attn.q_proj": "attn_q",
19
+ "self_attn.k_proj": "attn_k",
20
+ "self_attn.v_proj": "attn_v",
21
+ "self_attn.o_proj": "attn_output",
22
+ "mlp.gate_proj": "ffn_gate",
23
+ "mlp.down_proj": "ffn_down",
24
+ "mlp.up_proj": "ffn_up",
25
+ "input_layernorm": "attn_norm",
26
  "post_attention_layernorm": "ffn_norm",
 
 
 
27
  }
28
 
29
 
 
40
  sys.exit(1)
41
 
42
  output_string = (
43
+ f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
44
  )
45
  return output_string
46
  else:
 
48
  sys.exit(1)
49
 
50
 
51
+ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
52
  fout.write(b"ggla"[::-1]) # magic (ggml lora)
53
  fout.write(struct.pack("i", 1)) # file version
54
  fout.write(struct.pack("i", params["r"]))
55
  # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
56
  # but some models ship a float value instead
57
  # let's convert to int, but fail if lossless conversion is not possible
58
+ assert (
59
+ int(params["lora_alpha"]) == params["lora_alpha"]
60
+ ), "cannot convert float to int losslessly"
61
  fout.write(struct.pack("i", int(params["lora_alpha"])))
62
 
63
 
64
  def write_tensor_header(
65
+ self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
66
  ) -> None:
67
  sname = name.encode("utf-8")
68
  fout.write(
 
70
  "iii",
71
  len(shape),
72
  len(sname),
73
+ NUMPY_TYPE_TO_FTYPE[data_type.name],
74
  )
75
  )
76
  fout.write(struct.pack("i" * len(shape), *shape[::-1]))
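write_file_header() above emits a small fixed header: the reversed b'ggla' magic followed by three packed ints (file version, LoRA rank r, and lora_alpha coerced to int). A sketch of writing that header to a buffer and reading it back, not part of the commit; the r and lora_alpha values are made up:

import io
import struct

params = {"r": 8, "lora_alpha": 16}           # made-up adapter_config.json values

fout = io.BytesIO()
fout.write(b"ggla"[::-1])                     # magic stored reversed, i.e. b'algg'
fout.write(struct.pack("i", 1))               # file version
fout.write(struct.pack("i", params["r"]))
fout.write(struct.pack("i", int(params["lora_alpha"])))

fout.seek(0)
magic = fout.read(4)
version, r, alpha = struct.unpack("iii", fout.read(12))
print(magic, version, r, alpha)               # b'algg' 1 8 16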
convert-starcoder-hf-to-gguf.py ADDED
@@ -0,0 +1,248 @@
1
+ #!/usr/bin/env python3
2
+ # HF starcoder --> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import struct
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import numpy as np
15
+ import torch
16
+ from transformers import AutoTokenizer # type: ignore[import]
17
+
18
+ if 'NO_LOCAL_GGUF' not in os.environ:
19
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
20
+ import gguf
21
+
22
+
23
+ def bytes_to_unicode():
24
+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
25
+ """
26
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
27
+ The reversible bpe codes work on unicode strings.
28
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
29
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
30
+ This is a significant percentage of your normal, say, 32K bpe vocab.
31
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
32
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
33
+ """
34
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
35
+ cs = bs[:]
36
+ n = 0
37
+ for b in range(2**8):
38
+ if b not in bs:
39
+ bs.append(b)
40
+ cs.append(2**8+n)
41
+ n += 1
42
+ return dict(zip(bs, (chr(n) for n in cs)))
43
+
44
+
45
+ def count_model_parts(dir_model: Path) -> int:
46
+ num_parts = 0
47
+ for filename in os.listdir(dir_model):
48
+ if filename.startswith("pytorch_model-"):
49
+ num_parts += 1
50
+
51
+ if num_parts > 0:
52
+ print("gguf: found " + str(num_parts) + " model parts")
53
+ return num_parts
54
+
55
+
56
+ def parse_args() -> argparse.Namespace:
57
+ parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
58
+ parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
59
+ parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
60
+ parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
61
+ parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
62
+ return parser.parse_args()
63
+
64
+ args = parse_args()
65
+
66
+ dir_model = args.model
67
+ ftype = args.ftype
68
+ if not dir_model.is_dir():
69
+ print(f'Error: {args.model} is not a directory', file = sys.stderr)
70
+ sys.exit(1)
71
+
72
+ # possible tensor data types
73
+ # ftype == 0 -> float32
74
+ # ftype == 1 -> float16
75
+
76
+ # map from ftype to string
77
+ ftype_str = ["f32", "f16"]
78
+
79
+ if args.outfile is not None:
80
+ fname_out = args.outfile
81
+ else:
82
+ # output in the same directory as the model by default
83
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
84
+
85
+ print("gguf: loading model "+dir_model.name)
86
+
87
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
88
+ hparams = json.load(f)
89
+
90
+ if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
91
+ print("Model architecture not supported: " + hparams["architectures"][0])
92
+
93
+ sys.exit(1)
94
+
95
+ # get number of model parts
96
+ num_parts = count_model_parts(dir_model)
97
+
98
+ ARCH=gguf.MODEL_ARCH.STARCODER
99
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
100
+
101
+ print("gguf: get model metadata")
102
+
103
+ block_count = hparams["n_layer"]
104
+
105
+ gguf_writer.add_name("StarCoder")
106
+ gguf_writer.add_context_length(hparams["n_positions"])
107
+ gguf_writer.add_embedding_length(hparams["n_embd"])
108
+ gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
109
+ gguf_writer.add_block_count(block_count)
110
+ gguf_writer.add_head_count(hparams["n_head"])
111
+ gguf_writer.add_head_count_kv(1)
112
+ gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
113
+ gguf_writer.add_file_type(ftype)
114
+
115
+ # TOKENIZATION
116
+
117
+ print("gguf: get tokenizer metadata")
118
+
119
+ tokens: list[bytearray] = []
120
+ scores: list[float] = []
121
+ toktypes: list[int] = []
122
+
123
+ tokenizer_json_file = dir_model / 'tokenizer.json'
124
+ if not tokenizer_json_file.is_file():
125
+ print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
126
+ sys.exit(1)
127
+
128
+ # gpt2 tokenizer
129
+ gguf_writer.add_tokenizer_model("gpt2")
130
+
131
+ with open(tokenizer_json_file, "r", encoding="utf-8") as f:
132
+ tokenizer_json = json.load(f)
133
+
134
+ print("gguf: get gpt2 tokenizer vocab")
135
+
136
+ # The number of tokens in tokenizer.json can differ from the expected vocab size.
137
+ # This causes downstream issues with mismatched tensor sizes when running the inference
138
+ vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
139
+
140
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
141
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
142
+
143
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
144
+ byte_encoder = bytes_to_unicode()
145
+ byte_decoder = {v: k for k, v in byte_encoder.items()}
146
+
147
+ for i in range(vocab_size):
148
+ if i in reverse_vocab:
149
+ try:
150
+ text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
151
+ except KeyError:
152
+ text = bytearray()
153
+ for c in reverse_vocab[i]:
154
+ if ord(c) < 256: # single byte character
155
+ text.append(byte_decoder[ord(c)])
156
+ else: # multibyte special token character
157
+ text.extend(c.encode('utf-8'))
158
+ else:
159
+ print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
160
+ pad_token = f"[PAD{i}]".encode("utf8")
161
+ text = bytearray(pad_token)
162
+
163
+ tokens.append(text)
164
+ scores.append(0.0) # dummy
165
+ toktypes.append(gguf.TokenType.NORMAL) # dummy
166
+
167
+ gguf_writer.add_token_list(tokens)
168
+ gguf_writer.add_token_scores(scores)
169
+ gguf_writer.add_token_types(toktypes)
170
+
171
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
172
+ special_vocab.add_to_gguf(gguf_writer)
173
+
174
+ # TENSORS
175
+
176
+ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
177
+
178
+ # params for qkv transform
179
+ n_head = hparams["n_head"]
180
+ n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
181
+
182
+ head_dim = hparams["n_embd"] // n_head
183
+
184
+ # tensor info
185
+ print("gguf: get tensor metadata")
186
+
187
+ if num_parts == 0:
188
+ part_names = iter(("pytorch_model.bin",))
189
+ else:
190
+ part_names = (
191
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
192
+ )
193
+
194
+ for part_name in part_names:
195
+ if args.vocab_only:
196
+ break
197
+ print("gguf: loading model part '" + part_name + "'")
198
+ model_part = torch.load(dir_model / part_name, map_location="cpu")
199
+
200
+ for name in model_part.keys():
201
+ data = model_part[name]
202
+
203
+ old_dtype = data.dtype
204
+
205
+ # convert any unsupported data types to float32
206
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
207
+ data = data.to(torch.float32)
208
+
209
+ data = data.squeeze().numpy()
210
+
211
+ # map tensor names
212
+ new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
213
+ if new_name is None:
214
+ print("Can not map tensor '" + name + "'")
215
+ sys.exit()
216
+
217
+ n_dims = len(data.shape)
218
+ data_dtype = data.dtype
219
+
220
+ # if f32 desired, convert any float16 to float32
221
+ if ftype == 0 and data_dtype == np.float16:
222
+ data = data.astype(np.float32)
223
+
224
+ # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
225
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
226
+ data = data.astype(np.float32)
227
+
228
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
229
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
230
+ data = data.astype(np.float16)
231
+
232
+ print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
233
+
234
+ gguf_writer.add_tensor(new_name, data)
235
+
236
+
237
+ print("gguf: write header")
238
+ gguf_writer.write_header_to_file()
239
+ print("gguf: write metadata")
240
+ gguf_writer.write_kv_data_to_file()
241
+ if not args.vocab_only:
242
+ print("gguf: write tensors")
243
+ gguf_writer.write_tensors_to_file()
244
+
245
+ gguf_writer.close()
246
+
247
+ print(f"gguf: model successfully exported to '{fname_out}'")
248
+ print("")
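Every HF converter in this commit repeats the same dtype policy just before add_tensor(). A sketch of that policy factored into a helper, not part of the scripts; the helper name and the sample tensor are made up:

import numpy as np

def convert_dtype(data: np.ndarray, name: str, ftype: int) -> np.ndarray:
    # hypothetical helper: the scripts inline these three checks for each tensor
    n_dims = len(data.shape)
    if ftype == 0 and data.dtype == np.float16:
        return data.astype(np.float32)        # f32 output: widen any f16
    if ftype == 1 and data.dtype == np.float16 and n_dims == 1:
        return data.astype(np.float32)        # 1-D tensors end up f32 even for f16 output
    if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
        return data.astype(np.float16)        # 2-D .weight tensors become f16
    return data

print(convert_dtype(np.zeros((4, 4), dtype=np.float32), "blk.0.attn_q.weight", 1).dtype)  # float16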
convert.py CHANGED
@@ -1,4 +1,6 @@
1
- #!/usr/bin/env python
 
 
2
  import argparse
3
  import concurrent.futures
4
  import copy
@@ -15,141 +17,151 @@ import re
15
  import signal
16
  import struct
17
  import sys
 
18
  import zipfile
19
  from abc import ABCMeta, abstractmethod
 
20
  from dataclasses import dataclass
21
  from pathlib import Path
22
- from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List,
23
- Literal, Optional, Sequence, Tuple, TypeVar, Union)
24
 
25
  import numpy as np
26
- from sentencepiece import SentencePieceProcessor # type: ignore
 
 
 
 
 
27
 
28
  if TYPE_CHECKING:
29
- from typing_extensions import TypeAlias
30
 
31
  if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
32
  faulthandler.register(signal.SIGUSR1)
33
 
34
- NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
35
 
 
 
 
 
 
 
 
36
 
37
  @dataclass(frozen=True)
38
- class UnquantizedDataType:
39
  name: str
 
 
40
 
41
-
42
- DT_F16 = UnquantizedDataType('F16')
43
- DT_F32 = UnquantizedDataType('F32')
44
- DT_I32 = UnquantizedDataType('I32')
45
- DT_BF16 = UnquantizedDataType('BF16')
46
-
47
 
48
  @dataclass(frozen=True)
49
- class QuantizedDataType:
50
- groupsize: int
51
- have_addends: bool
52
- have_g_idx: bool
53
-
54
 
55
- DT_Q4_0 = QuantizedDataType(groupsize=32, have_addends=False, have_g_idx=False)
56
- DT_Q4_1 = QuantizedDataType(groupsize=32, have_addends=True, have_g_idx=False)
 
 
57
 
58
- DataType = Union[UnquantizedDataType, QuantizedDataType]
 
 
 
 
59
 
60
- DATA_TYPE_TO_FTYPE: Dict[DataType, int] = {
61
- DT_F32: 0,
62
- DT_F16: 1,
63
- DT_Q4_0: 2,
64
- DT_Q4_1: 3,
65
- }
66
 
67
- FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \
68
- {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}
 
69
 
70
- DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
71
- DT_BF16: np.dtype(np.uint16),
72
- DT_F16: np.dtype(np.float16),
73
- DT_F32: np.dtype(np.float32),
74
- DT_I32: np.dtype(np.int32),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  }
76
 
77
- NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
78
- {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
79
-
80
-
81
- class GGMLFileType(enum.Enum):
82
- AllF32 = 0
83
- MostlyF16 = 1 # except 1d tensors
84
- MostlyQ4_0 = 2 # except 1d tensors
85
- MostlyQ4_1 = 3 # except 1d tensors
86
- PerLayerIsQ4_1 = 4 # but tok_embeddings.weight and output.weight are F16
87
-
88
- def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
89
- if len(tensor.shape) == 1:
90
- # 1D tensors are always F32.
91
- return DT_F32
92
- elif self == GGMLFileType.AllF32:
93
- return DT_F32
94
- elif self == GGMLFileType.MostlyF16:
95
- return DT_F16
96
- elif self == GGMLFileType.MostlyQ4_0:
97
- return DT_Q4_0
98
- elif self == GGMLFileType.MostlyQ4_1:
99
- return DT_Q4_1
100
- elif self == GGMLFileType.PerLayerIsQ4_1:
101
- if name in ('output.weight', 'tok_embeddings.weight'):
102
- return DT_F16
103
- else:
104
- return DT_Q4_1
105
- else:
106
  raise ValueError(self)
 
 
107
 
 
 
 
 
 
108
 
109
- def make_tensors_list() -> List[str]:
110
- ret = [
111
- 'tok_embeddings.weight',
112
- 'norm.weight',
113
- 'output.weight',
114
- ]
115
- for i in range(80): # maximum number of layer
116
- ret += [
117
- f'layers.{i}.attention.wq.weight',
118
- f'layers.{i}.attention.wk.weight',
119
- f'layers.{i}.attention.wv.weight',
120
- f'layers.{i}.attention.wo.weight',
121
- f'layers.{i}.attention_norm.weight',
122
- f'layers.{i}.feed_forward.w1.weight',
123
- f'layers.{i}.feed_forward.w2.weight',
124
- f'layers.{i}.feed_forward.w3.weight',
125
- f'layers.{i}.ffn_norm.weight',
126
- ]
127
- return ret
128
 
 
 
 
 
 
 
 
 
 
 
129
 
130
- TENSORS_LIST = make_tensors_list()
131
- TENSORS_SET = set(TENSORS_LIST)
132
 
 
133
 
134
- def find_n_mult(n_ff: int, n_embd: int) -> int:
135
- # hardcoded magic range
136
- for n_mult in range(8192, 1, -1):
137
- calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
138
- if calc_ff == n_ff:
139
- return n_mult
140
- raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
141
-
142
- @dataclass
143
- class Params:
144
- n_vocab: int
145
- n_embd: int
146
- n_mult: int
147
- n_head: int
148
- n_layer: int
149
- n_kv_head: Optional[int] # This parameter is only used for Llama 2
150
 
151
  @staticmethod
152
- def guessed(model: 'LazyModel') -> 'Params':
153
  # try transformer naming first
154
  n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
155
 
@@ -165,65 +177,110 @@ class Params:
165
  raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
166
  "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
167
 
168
- n_head=n_embd // 128 # guessed
 
 
 
 
 
169
 
170
  return Params(
171
- n_vocab = n_vocab,
172
- n_embd = n_embd,
173
- n_mult = 256,
174
- n_head = n_head,
175
- n_layer = n_layer,
176
- n_kv_head = None,
 
 
177
  )
178
 
179
  @staticmethod
180
- def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
181
  config = json.load(open(config_path))
182
 
183
- n_vocab = config["vocab_size"];
184
- n_embd = config["hidden_size"];
185
- n_head = config["num_attention_heads"];
186
- n_layer = config["num_hidden_layers"];
187
- n_ff = config["intermediate_size"];
188
- n_kv_head = config.get("num_key_value_heads")
 
 
 
 
 
 
 
 
189
 
190
- n_mult = find_n_mult(n_ff, n_embd);
 
 
 
 
 
 
191
 
192
  return Params(
193
- n_vocab = n_vocab,
194
- n_embd = n_embd,
195
- n_mult = n_mult,
196
- n_head = n_head,
197
- n_layer = n_layer,
198
- n_kv_head = n_kv_head,
 
 
 
 
199
  )
200
 
201
  # LLaMA v2 70B params.json
202
- # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
203
  @staticmethod
204
- def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
205
  config = json.load(open(config_path))
206
 
207
- n_vocab = config["vocab_size"];
208
- n_embd = config["dim"];
209
- n_head = config["n_heads"];
210
- n_layer = config["n_layers"];
211
- n_mult = config["multiple_of"];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
  if n_vocab == -1:
214
  n_vocab = model["tok_embeddings.weight"].shape[0]
215
 
 
 
 
216
  return Params(
217
- n_vocab = n_vocab,
218
- n_embd = n_embd,
219
- n_mult = n_mult,
220
- n_head = n_head,
221
- n_layer = n_layer,
222
- n_kv_head = None,
 
 
 
223
  )
224
 
225
  @staticmethod
226
- def load(model_plus: 'ModelPlus') -> 'Params':
227
  hf_config_path = model_plus.paths[0].parent / "config.json"
228
  orig_config_path = model_plus.paths[0].parent / "params.json"
229
 
@@ -231,33 +288,104 @@ class Params:
231
  params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
232
  elif orig_config_path.exists():
233
  params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
234
- else:
235
  params = Params.guessed(model_plus.model)
 
 
 
 
236
 
237
- print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
238
  return params
239
 
240
 
241
- class SentencePieceVocab:
242
- def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
243
- self.vocabtype = vocabtype
244
- if self.vocabtype == "bpe":
245
- self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
 
 
 
 
 
 
246
  else:
247
- self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
248
- added_tokens: Dict[str, int]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  if fname_added_tokens is not None:
250
- added_tokens = json.load(open(fname_added_tokens))
251
  else:
252
  added_tokens = {}
253
- if self.vocabtype == "bpe":
254
- vocab_size: int = len(self.sentencepiece_tokenizer)
255
- else:
256
- vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
257
  expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
258
- actual_ids = sorted(added_tokens.values())
259
  if expected_ids != actual_ids:
260
  raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
 
261
  items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
262
  self.added_tokens_list = [text for (text, idx) in items]
263
  self.vocab_size_base: int = vocab_size
@@ -265,126 +393,74 @@ class SentencePieceVocab:
265
  self.fname_tokenizer = fname_tokenizer
266
  self.fname_added_tokens = fname_added_tokens
267
 
268
- def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
269
  tokenizer = self.sentencepiece_tokenizer
270
- if self.vocabtype == "bpe":
271
- from transformers.models.gpt2 import tokenization_gpt2
272
- byte_encoder = tokenization_gpt2.bytes_to_unicode()
273
- byte_decoder = {v: k for k, v in byte_encoder.items()}
274
- for i, item in enumerate(tokenizer):
275
- text: bytes
276
- text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
277
- score: float = -i
278
- yield text, score
279
- else:
280
- for i in range(tokenizer.vocab_size()):
281
- text: bytes
282
- if tokenizer.is_unknown(i):
283
- text = " \u2047 ".encode("utf-8")
284
- elif tokenizer.is_control(i):
285
- text = b""
286
- elif tokenizer.is_byte(i):
287
- piece = tokenizer.id_to_piece(i)
288
- if len(piece) != 6:
289
- raise Exception(f"Invalid token: {piece}")
290
- byte_value = int(piece[3:-1], 16)
291
- text = struct.pack("B", byte_value)
292
- else:
293
- text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
294
- score: float = tokenizer.get_score(i)
295
- yield text, score
296
-
297
- def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
298
  for text in self.added_tokens_list:
299
  score = -1000.0
300
- yield text.encode("utf-8"), score
301
 
302
- def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
303
  yield from self.sentencepiece_tokens()
304
  yield from self.added_tokens()
305
 
306
  def __repr__(self) -> str:
307
  return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
308
 
 
309
 
310
- class GGMLVocab:
311
- def __init__(self, tokens: List[Tuple[bytes, float]]):
312
- self.tokens = tokens
313
- self.vocab_size = len(tokens)
314
 
315
- def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
316
- return self.tokens
317
-
318
- def __repr__(self) -> str:
319
- return f"<GGMLVocab with {self.vocab_size} tokens>"
320
-
321
-
322
- Vocab = Union[SentencePieceVocab, GGMLVocab]
323
-
324
-
325
- def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
326
- if n_kv_head is not None and n_head != n_kv_head:
327
- n_head //= n_kv_head
328
  return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
329
  .swapaxes(1, 2)
330
  .reshape(weights.shape))
331
 
332
 
333
- def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
334
- # First reinterpret each row from a list of int32s containing 8 values each
335
- # to a list of uint8s containing 2 values each.
336
- qvalues_pack8 = qvalues_pack32.view(np.uint8)
337
-
338
- # Then split out the two values per int8 (which requires an actual
339
- # conversion because numpy doesn't natively support int4s).
340
- qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8)
341
- qvalues[:, 0::2] = qvalues_pack8 & 0xf
342
- qvalues[:, 1::2] = qvalues_pack8 >> 4
343
-
344
- assert addends is None or addends.shape == scales.shape
345
- assert qvalues.shape[0] == scales.shape[0]
346
- assert qvalues.shape[1] % scales.shape[1] == 0
347
- if g_idx is None:
348
- repeat_count = qvalues.shape[1] // scales.shape[1]
349
- scales = scales[:, :, np.newaxis]
350
- if addends is not None:
351
- addends = addends[:, :, np.newaxis]
352
- # Reshape so that the below computation broadcasts over scales and addends:
353
- qvalues.shape = (qvalues.shape[0], scales.shape[1], int(repeat_count))
354
- else:
355
- # In this case the scale and addend is selected for each column by g_idx:
356
- assert addends is not None
357
- scales = scales[:, g_idx]
358
- addends = addends[:, g_idx]
359
- if addends is None:
360
- # Q4_0
361
- qvalues = qvalues.view(np.int8)
362
- qvalues -= 8
363
- # And do the actual 'value = scale * qvalue + addend' computation.
364
- values = scales * qvalues
365
- if addends is not None:
366
- values += addends
367
- if g_idx is None:
368
- values.shape = (values.shape[0], values.shape[1] * values.shape[2])
369
- return values
370
-
371
-
372
  class Tensor(metaclass=ABCMeta):
373
  data_type: DataType
374
 
375
  @abstractmethod
376
- def astype(self, data_type: DataType) -> 'Tensor': ...
377
  @abstractmethod
378
- def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
379
  @abstractmethod
380
- def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
381
  @abstractmethod
382
- def part(self, n_part: int) -> 'UnquantizedTensor': ...
383
  @abstractmethod
384
- def to_ggml(self) -> 'GGMLCompatibleTensor': ...
385
 
386
 
387
- def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
388
  assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
389
  fp32_arr = bf16_arr.astype(np.uint32) << 16
390
  return fp32_arr.view(np.float32)
@@ -397,27 +473,27 @@ class UnquantizedTensor(Tensor):
397
  self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
398
 
399
  def astype(self, data_type: DataType) -> Tensor:
400
- dtype = DATA_TYPE_TO_NUMPY[data_type]
401
  if self.data_type == DT_BF16:
402
  self.ndarray = bf16_to_fp32(self.ndarray)
403
  return UnquantizedTensor(self.ndarray.astype(dtype))
404
 
405
- def to_ggml(self) -> 'UnquantizedTensor':
406
  return self
407
 
408
- def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
409
  r = self.ndarray.shape[0] // 3
410
- return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
411
 
412
- def part(self, n_part: int) -> 'UnquantizedTensor':
413
  r = self.ndarray.shape[0] // 3
414
  return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
415
 
416
- def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
417
- return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))
418
 
419
 
420
- def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
421
  tensor = lazy_tensor.load()
422
  assert isinstance(tensor, UnquantizedTensor)
423
 
@@ -433,196 +509,24 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv
433
  return tensor.ndarray
434
 
435
 
436
- class GGMLQuantizedTensor(Tensor):
437
- data_type: QuantizedDataType
438
-
439
- def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None:
440
- rows, columns = shape
441
- assert data_type in (DT_Q4_1, DT_Q4_0) # for now
442
- assert isinstance(data_type, QuantizedDataType) # redundant, but mypy complains without this
443
- assert columns % data_type.groupsize == 0
444
- words_in_block = 6 if data_type == DT_Q4_1 else 5
445
- self.ndarray = ndarray.view(dtype=np.uint32).reshape((rows, columns // data_type.groupsize, words_in_block))
446
- self.shape = shape[:]
447
- self.data_type = data_type
448
-
449
- def astype(self, data_type: DataType) -> Tensor:
450
- if data_type == self.data_type:
451
- return self
452
- scales = self.ndarray[:, :, 0].view(np.float32)
453
- if self.data_type.have_addends:
454
- addends = self.ndarray[:, :, 1].view(np.float32)
455
- else:
456
- addends = None
457
- qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8])
458
-
459
- dq = dequantize_q4(qweights, scales, addends, g_idx=None)
460
- return UnquantizedTensor(dq).astype(data_type)
461
-
462
- def to_ggml(self) -> 'GGMLQuantizedTensor':
463
- return self
464
-
465
- def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
466
- return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)
467
-
468
- def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
469
- r = self.ndarray.shape[0] // 3
470
- return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
471
-
472
- def part(self, n_part: int) -> 'UnquantizedTensor':
473
- r = self.ndarray.shape[0] // 3
474
- return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
475
-
476
- GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
477
-
478
-
479
- class DeferredPermutedTensor(Tensor):
480
- def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
481
- self.base = base
482
- self.n_head = n_head
483
- self.n_kv_head = n_kv_head
484
- self.data_type = self.base.data_type
485
-
486
- def astype(self, data_type: DataType) -> Tensor:
487
- return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)
488
-
489
- def to_ggml(self) -> GGMLCompatibleTensor:
490
- return self.base.to_ggml().permute(self.n_head, self.n_kv_head)
491
-
492
- def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
493
- raise Exception("shouldn't permute twice")
494
-
495
-
496
- class GPTQForLLaMaQuantizedTensor(Tensor):
497
- def __init__(self, model: 'LazyModel', namebase: str) -> None:
498
- qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32)
499
- scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True)
500
-
501
- bias = model.get(f"{namebase}.bias")
502
- if bias is not None:
503
- # Q4_1 does not support bias; good thing the bias is always all zeros.
504
- assert not np.any(load_unquantized(bias))
505
-
506
- if f"{namebase}.zeros" in model:
507
- zeros = load_unquantized(model[f"{namebase}.zeros"], np.float32)
508
- else:
509
- qzeros = load_unquantized(model[f"{namebase}.qzeros"], np.int32)
510
- assert qzeros.dtype == np.int32
511
- zeros = dequantize_q4(qzeros, scales, scales, g_idx=None)
512
- assert zeros.dtype == np.float32
513
-
514
- assert zeros.shape == scales.shape
515
-
516
- # Output is transposed compared to the input, and addends have their sign flipped.
517
- # Scales and zeros similarly must be transposed but only for newer
518
- # versions of GPTQ-for-LLaMa; the older versions can be identified by
519
- # having shape (n_embd, 1).
520
- qweight = qweight.T
521
- if scales.shape[1] != 1:
522
- scales = scales.T
523
- zeros = zeros.T
524
-
525
- # Output also has signs flipped for the addends.
526
- self.qweight = qweight
527
- self.scales = scales
528
- self.addends = -zeros
529
-
530
- self.g_idx: Optional[NDArray]
531
- if f"{namebase}.g_idx" in model:
532
- self.g_idx = load_unquantized(model[f"{namebase}.g_idx"], np.int32)
533
- assert self.g_idx.shape == (qweight.shape[1] * 8,)
534
- else:
535
- self.g_idx = None
536
-
537
- self.shape = [self.qweight.shape[0], self.qweight.shape[1] * 8]
538
- self.data_type = QuantizedDataType(groupsize=self.groupsize(), have_addends=True,
539
- have_g_idx=(self.g_idx is not None))
540
-
541
- def inspect(self, row: int, col: int) -> None:
542
- '''For debugging.'''
543
- qweight = (self.qweight[row, col // 8] >> (4 * (col & 7))) & 0xf
544
- if self.g_idx is not None:
545
- group = self.g_idx[col]
546
- else:
547
- group = int(col // self.groupsize())
548
- scale = self.scales[row, group]
549
- addend = self.addends[row, group]
550
- with np.printoptions(precision=None, suppress=True):
551
- print(f'scale:{scale} addend:{addend} qweight:{qweight}')
552
- print('possible values:', np.arange(16) * scale + addend)
553
- print('actual value:', qweight * scale + addend)
554
-
555
- def astype(self, data_type: DataType) -> Tensor:
556
- if isinstance(data_type, QuantizedDataType):
557
- assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False
558
- return self.regroup(data_type.groupsize)
559
-
560
- dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx)
561
- return UnquantizedTensor(dequantized).astype(data_type)
562
-
563
- def groupsize(self) -> int:
564
- assert self.addends.shape == self.scales.shape
565
- assert self.shape[1] % self.scales.shape[1] == 0
566
- return self.shape[1] // self.scales.shape[1]
567
-
568
- def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor':
569
- # Old versions of GPTQ-for-LLaMa shared scales and addends between all the
570
- # columns in a row. Newer versions share them between every set of N
571
- # columns in a row, where N is the `groupsize` parameter, usually 128. The
572
- # output format shares them between every set of 32 columns. To handle
573
- # this, duplicate scales and addends for every smaller group.
574
- # (In the above, 'row' and 'column' are in the sense of the output.)
575
- assert self.g_idx is None
576
- old_groupsize = self.groupsize()
577
- assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize
578
- ret = copy.copy(self)
579
- ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1)
580
- ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1)
581
- ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
582
- return ret
583
-
584
- def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
585
- return DeferredPermutedTensor(self, n_head, n_kv_head)
586
-
587
- def to_ggml(self) -> GGMLQuantizedTensor:
588
- # The output format looks like this:
589
- # For each row:
590
- # For each group of 32 columns:
591
- # - addend (float32, 4 bytes)
592
- # - scale (float32, 4 bytes)
593
- # - weights (int4 * 32, 16 bytes)
594
-
595
- if self.groupsize() != 32:
596
- raise Exception("should have been regrouped before converting to ggml")
597
-
598
- # Since the output format is mixed between integers and floats, we have
599
- # to hackily view the floats as int32s just so numpy will let us
600
- # concatenate them.
601
- addends_view = self.addends.view(dtype=np.int32)[:, :, np.newaxis]
602
- scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis]
603
-
604
- # Split into groups of 4 columns (i.e. 32 columns of quantized data):
605
- grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4])
606
-
607
- # And concatenate:
608
- grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no')
609
-
610
- return GGMLQuantizedTensor(grouped, self.shape, DT_Q4_1)
611
 
612
 
613
  @dataclass
614
  class LazyTensor:
615
  _load: Callable[[], Tensor]
616
- shape: List[int]
617
  data_type: DataType
618
  description: str
619
 
620
  def load(self) -> Tensor:
621
  ret = self._load()
622
- assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description)
 
 
623
  return ret
624
 
625
- def astype(self, data_type: DataType) -> 'LazyTensor':
626
  self.validate_conversion_to(data_type)
627
 
628
  def load() -> Tensor:
@@ -630,39 +534,28 @@ class LazyTensor:
630
  return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
631
 
632
  def validate_conversion_to(self, data_type: DataType) -> None:
633
- if data_type == self.data_type:
634
- return
635
- if isinstance(data_type, QuantizedDataType):
636
- if not isinstance(self.data_type, QuantizedDataType):
637
- raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
638
- if self.data_type.have_g_idx:
639
- sys.stderr.write(
640
- "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
641
- "which is not yet natively supported by GGML. "
642
- "For now you can still convert this model by passing `--outtype f16` to dequantize, "
643
- "but that will result in a much larger output file for no quality benefit.\n")
644
- sys.exit(1)
645
- assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
646
 
647
 
648
- LazyModel = Dict[str, LazyTensor]
649
 
650
 
651
  @dataclass
652
  class ModelPlus:
653
  model: LazyModel
654
- paths: List[Path] # Where this was read from.
655
- format: Literal['ggml', 'torch', 'safetensors']
656
- vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab.
657
 
658
 
659
- def merge_sharded(models: List[LazyModel]) -> LazyModel:
660
  # Original LLaMA models have each file contain one part of each tensor.
661
  # Use a dict instead of a set to preserve order.
662
  names = {name: None for model in models for name in model}
663
 
664
  def convert(name: str) -> LazyTensor:
665
- lazy_tensors: List[LazyTensor] = [model[name] for model in models]
666
  if len(lazy_tensors) == 1:
667
  # only one file; don't go through this procedure since there might
668
  # be quantized tensors
@@ -690,7 +583,7 @@ def merge_sharded(models: List[LazyModel]) -> LazyModel:
690
  return {name: convert(name) for name in names}
691
 
692
 
693
- def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
694
  formats = set(mp.format for mp in models_plus)
695
  assert len(formats) == 1, "different formats?"
696
  format = formats.pop()
@@ -713,17 +606,17 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
713
  return ModelPlus(model, paths, format, vocab)
714
 
715
 
716
- def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
717
  def load() -> Tensor:
718
- return lazy_tensor.load().permute(n_head, n_kv_head)
719
- return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)
720
 
721
- def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
722
  def load() -> Tensor:
723
- return lazy_tensor.load().permute_part(n_part, n_head)
724
  s = lazy_tensor.shape.copy()
725
  s[0] = s[0] // 3
726
- return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
727
 
728
  def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
729
  def load() -> Tensor:
@@ -732,66 +625,6 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
732
  s[0] = s[0] // 3
733
  return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
734
 
735
- def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
736
- out: LazyModel = {}
737
- out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
738
- out["norm.weight"] = model["model.norm.weight"]
739
- out["output.weight"] = model["lm_head.weight"]
740
-
741
- for i in itertools.count():
742
- if f"model.layers.{i}.self_attn.q_proj.weight" in model:
743
- out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
744
- out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
745
- out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
746
- elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
747
- out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
748
- out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
749
- out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
750
- else:
751
- break
752
-
753
- out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
754
-
755
- out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]
756
- out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"]
757
- out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"]
758
-
759
- out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"]
760
- out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"]
761
- return out
762
-
763
-
764
- def handle_quantization(model: LazyModel) -> LazyModel:
765
- '''Convert a model with entries for 'foo.qweight', 'foo.scales', etc.
766
- (which resolve to UnquantizedTensors with the raw data) to one with entries
767
- for 'foo.weight' (which resolve to QuantizedTensors).
768
- '''
769
- def convert(name: str) -> Tuple[str, LazyTensor]:
770
- if name.endswith(".qweight"):
771
- namebase = name.rsplit('.', 1)[0]
772
- orig_name = namebase + ".weight"
773
-
774
- lazy_tensor = model[name]
775
- assert len(lazy_tensor.shape) == 2
776
- real_shape = [lazy_tensor.shape[1], lazy_tensor.shape[0] * 8]
777
-
778
- # Calculate type. This replicates the logic in
779
- # GPTQForLLaMaQuantizedTensor (which is executed when the modelis
780
- # actually loaded).
781
- lazy_scales = model[f"{namebase}.scales"]
782
- scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0]
783
- assert real_shape[1] % scales_width == 0
784
- groupsize = real_shape[1] // scales_width
785
- have_g_idx = f"{namebase}.g_idx" in model
786
- data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx)
787
-
788
- def load() -> Tensor:
789
- return GPTQForLLaMaQuantizedTensor(model, namebase)
790
-
791
- return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]'))
792
- else:
793
- return (name, model[name])
794
- return dict(convert(name) for name in model)
795
 
796
  # Functionality that simulates `torch.load` but where individual tensors are
797
  # only loaded into memory on demand, not all at once.
@@ -824,13 +657,11 @@ class LazyUnpickler(pickle.Unpickler):
824
  assert isinstance(pid[1], LazyStorageKind)
825
  data_type = pid[1].data_type
826
  filename_stem = pid[2]
827
- filename = self.data_base_path + '/' + filename_stem
828
  info = self.zip_file.getinfo(filename)
829
 
830
  def load(offset: int, elm_count: int) -> NDArray:
831
- dtype = DATA_TYPE_TO_NUMPY.get(data_type)
832
- if dtype is None:
833
- raise Exception("tensor stored in unsupported format")
834
  fp = self.zip_file.open(info)
835
  fp.seek(offset * dtype.itemsize)
836
  size = elm_count * dtype.itemsize
@@ -840,9 +671,8 @@ class LazyUnpickler(pickle.Unpickler):
840
  description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
841
  return LazyStorage(load=load, kind=pid[1], description=description)
842
 
843
- # @staticmethod
844
  def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
845
- # pyright: ignore[reportSelfClsParameterName]
846
  requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
847
  assert isinstance(storage, LazyStorage)
848
 
@@ -852,13 +682,15 @@ class LazyUnpickler(pickle.Unpickler):
852
  description = f'pickled storage_offset={storage_offset} in {storage.description}'
853
  return LazyTensor(load, list(size), storage.kind.data_type, description)
854
 
855
- # @staticmethod
856
  def rebuild_from_type_v2(func, new_type, args, state):
857
  return func(*args)
858
 
859
- CLASSES: Dict[Any, Any] = {
860
- ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
861
- ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
 
 
862
  ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
863
  ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
864
  ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
@@ -885,25 +717,17 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
885
  return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
886
 
887
 
888
- SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
889
- 'BF16': DT_BF16,
890
- 'F16': DT_F16,
891
- 'F32': DT_F32,
892
- 'I32': DT_I32,
893
- }
894
-
895
-
896
  def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
897
  header_size, = struct.unpack('<Q', fp.read(8))
898
- header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
899
  # Use mmap for the actual data to avoid race conditions with the file offset.
900
  mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
901
  byte_buf = mapped[8 + header_size:]
902
 
903
- def convert(info: Dict[str, Any]) -> LazyTensor:
904
  data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
905
- numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
906
- shape: List[int] = info['shape']
907
  begin, end = info['data_offsets']
908
  assert 0 <= begin <= end <= len(byte_buf)
909
  assert end - begin == math.prod(shape) * numpy_dtype.itemsize
@@ -924,84 +748,6 @@ def must_read(fp: IO[bytes], length: int) -> bytes:
924
  return ret
925
 
926
 
927
- def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
928
- magic = must_read(fp, 4)[::-1]
929
- if magic in (b'ggmf', b'ggjt'):
930
- version, = struct.unpack("i", must_read(fp, 4))
931
- assert version == 1
932
- else:
933
- assert magic == b'ggml'
934
- version = None
935
- n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack('<7i', must_read(fp, 28))
936
-
937
- tokens: List[Tuple[bytes, float]] = []
938
- for i in range(n_vocab):
939
- if i == 32000:
940
- # HACK: GPT4All messed with the format without changing the magic
941
- # number. Specifically, they changed the vocab section to contain
942
- # `n_vocab - 1` tokens instead of `n_vocab` (i.e. omitting the
943
- # extra pad token). Try to detect if we're reading a file like
944
- # this.
945
- orig_pos = fp.tell()
946
- fp.seek(20, io.SEEK_CUR)
947
- is_gpt4all = fp.read(21) == b'tok_embeddings.weight'
948
- fp.seek(orig_pos)
949
- if is_gpt4all:
950
- break
951
-
952
- length, = struct.unpack("i", must_read(fp, 4))
953
- text = must_read(fp, length)
954
- if magic != b'ggml':
955
- score, = struct.unpack("f", must_read(fp, 4))
956
- tokens.append((text, score))
957
- vocab = GGMLVocab(tokens) if magic != b'ggml' else None
958
-
959
- model: LazyModel = {}
960
- # Use mmap for the actual data to avoid race conditions with the file offset.
961
- off = fp.raw.tell()
962
- mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
963
- fp.raw.seek(off) # needed on Windows
964
-
965
- def read_tensor() -> None: # this is a function so that variables captured in `load` don't change
966
- shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
967
- assert 0 <= shape_len <= 3
968
- shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len)))
969
- shape = shape[::-1]
970
- name = must_read(fp, name_len).decode('utf-8')
971
- data_type = FTYPE_TO_DATA_TYPE[ftype]
972
-
973
- if magic == b'ggjt':
974
- fp.seek((fp.tell() + 31) & -32)
975
-
976
- if data_type == DT_Q4_1:
977
- # See GPTQForLLaMaQuantizedTensor.ggml_ndarray()
978
- size = 24 * (shape[1] // 32) * shape[0]
979
- elif data_type == DT_Q4_0:
980
- size = 20 * (shape[1] // 32) * shape[0]
981
- else:
982
- numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
983
- elm_count = math.prod(shape)
984
- size = elm_count * numpy_dtype.itemsize
985
- offset = fp.tell()
986
- buf = mapped[offset:offset+size]
987
- fp.seek(size, io.SEEK_CUR)
988
-
989
- def load() -> Tensor:
990
- if isinstance(data_type, QuantizedDataType):
991
- ndarray = np.frombuffer(buf, dtype=np.uint32)
992
- return GGMLQuantizedTensor(ndarray, shape, data_type)
993
- else:
994
- return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
995
- description = f'ggml offset={offset} type={data_type} path={path}'
996
- model[name] = LazyTensor(load, shape, data_type, description)
997
-
998
- while fp.read(1) != b'':
999
- fp.seek(-1, io.SEEK_CUR)
1000
- read_tensor()
1001
-
1002
- return ModelPlus(model=model, paths=[path], format='ggml', vocab=vocab)
1003
-
1004
-
1005
  @functools.lru_cache(maxsize=None)
1006
  def lazy_load_file(path: Path) -> ModelPlus:
1007
  fp = open(path, 'rb')
@@ -1010,9 +756,6 @@ def lazy_load_file(path: Path) -> ModelPlus:
1010
  if first8[:2] == b'PK':
1011
  # A zip file, i.e. PyTorch format
1012
  return lazy_load_torch_file(fp, path)
1013
- elif first8[2:4] == b'gg':
1014
- # GGML format
1015
- return lazy_load_ggml_file(fp, path)
1016
  elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
1017
  # Probably safetensors
1018
  return lazy_load_safetensors_file(fp, path)
@@ -1023,28 +766,43 @@ def lazy_load_file(path: Path) -> ModelPlus:
1023
  In = TypeVar('In')
1024
  Out = TypeVar('Out')
1025
 
1026
-
1027
- def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
1028
  '''Parallel map, but with backpressure. If the caller doesn't call `next`
1029
  fast enough, this will stop calling `func` at some point rather than
1030
  letting results pile up in memory. Specifically, there is a max of one
1031
  output value buffered per thread.'''
1032
- with concurrent.futures.ThreadPoolExecutor() as executor:
1033
- futures: List[concurrent.futures.Future[Out]] = []
1034
- items_rev = list(iterable)[::-1]
1035
- for i in range(min(concurrency, len(items_rev))):
1036
- futures.append(executor.submit(func, items_rev.pop()))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1037
  while futures:
1038
  result = futures.pop(0).result()
1039
- if items_rev:
1040
- futures.append(executor.submit(func, items_rev.pop()))
 
 
 
 
1041
  yield result
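A hypothetical usage sketch of bounded_parallel_map (the helper name and numbers below are invented, and it assumes the concurrency argument from the signature shown above is still accepted): at most `concurrency` calls are in flight at once, and results are yielded in input order.

    # Invented example: square numbers with a bounded number of in-flight tasks.
    def slow_square(x: int) -> int:
        time.sleep(0.01)   # stand-in for real work; time is imported at the top of the new file
        return x * x

    squares = list(bounded_parallel_map(slow_square, range(10), concurrency=4))
    assert squares == [x * x for x in range(10)]   # results come back in input order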
1042
 
1043
-
1044
  def check_vocab_size(params: Params, vocab: Vocab) -> None:
1045
  if params.n_vocab != vocab.vocab_size:
1046
- # GGMLVocab comes from the same file as the model so shouldn't mismatch:
1047
- assert isinstance(vocab, SentencePieceVocab)
1048
  if params.n_vocab == vocab.vocab_size_base:
1049
  print("Ignoring added_tokens.json since model matches vocab size without it.")
1050
  vocab.added_tokens_list = []
@@ -1061,105 +819,200 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
1061
 
1062
  class OutputFile:
1063
  def __init__(self, fname_out: Path) -> None:
1064
- self.fout = open(fname_out, "wb")
1065
-
1066
- def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
1067
- self.fout.write(b"ggjt"[::-1]) # magic
1068
- values = [
1069
- 1, # file version
1070
- params.n_vocab,
1071
- params.n_embd,
1072
- params.n_mult,
1073
- params.n_head,
1074
- params.n_layer,
1075
- params.n_embd // params.n_head, # rot (obsolete)
1076
- file_type.value,
1077
- ]
1078
- self.fout.write(struct.pack("i" * len(values), *values))
1079
-
1080
- def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
1081
- sname = name.encode('utf-8')
1082
- self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type]))
1083
- self.fout.write(struct.pack("i" * len(shape), *shape[::-1]))
1084
- self.fout.write(sname)
1085
- self.fout.seek((self.fout.tell() + 31) & -32)
1086
-
1087
- def write_vocab(self, vocab: Vocab) -> None:
1088
- for text, score in vocab.all_tokens():
1089
- self.fout.write(struct.pack("i", len(text)))
1090
- self.fout.write(text)
1091
- self.fout.write(struct.pack("f", score))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1092
 
1093
  @staticmethod
1094
- def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
1095
- of = OutputFile(fname_out)
1096
- params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
1097
  of = OutputFile(fname_out)
1098
- of.write_file_header(params, file_type=GGMLFileType.AllF32)
1099
- of.write_vocab(vocab)
1100
- of.fout.close()
 
 
 
 
 
 
 
 
 
 
 
 
1101
 
1102
  @staticmethod
1103
- def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
 
 
 
 
 
 
 
1104
  check_vocab_size(params, vocab)
 
1105
  of = OutputFile(fname_out)
1106
- of.write_file_header(params, file_type)
1107
- print("Writing vocab...")
1108
- of.write_vocab(vocab)
1109
 
1110
- def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
1111
- name, lazy_tensor = item
1112
- return lazy_tensor.load().to_ggml().ndarray
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1113
 
1114
- ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
1115
  for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
 
1116
  size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
1117
  padi = len(str(len(model)))
1118
- print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
1119
- of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
1120
- ndarray.tofile(of.fout)
1121
- of.fout.close()
1122
 
 
1123
 
1124
- def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
1125
- wq_type = model["layers.0.attention.wq.weight"].data_type
1126
- if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
 
1127
  return GGMLFileType.AllF32
1128
- if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
1129
  return GGMLFileType.MostlyF16
1130
- if output_type_str == "q4_1" or (output_type_str is None and isinstance(wq_type, QuantizedDataType) and
1131
- wq_type.have_addends):
1132
- if isinstance(model["output.weight"].data_type, QuantizedDataType):
1133
- return GGMLFileType.MostlyQ4_1
1134
- else:
1135
- return GGMLFileType.PerLayerIsQ4_1
1136
- if output_type_str == "q4_0" or (output_type_str is None and isinstance(wq_type, QuantizedDataType)):
1137
- return GGMLFileType.MostlyQ4_0
1138
  name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
 
1139
  raise Exception(f"Unexpected combination of types: {name_to_type}")
1140
 
 
 
 
1141
 
1142
- def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
1143
- model = handle_quantization(model)
 
1144
 
1145
- if "lm_head.weight" in model:
1146
- model = convert_transformers_to_orig(model, params)
1147
- model = filter_and_sort_tensors(model)
1148
 
1149
- return model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1150
 
 
 
 
 
 
1151
 
1152
- def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
1153
- return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
1154
- for (name, tensor) in model.items()}
1155
 
 
 
1156
 
1157
- def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
 
 
1158
  '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
1159
  the nth path in the model.
1160
  '''
1161
  # Support the following patterns:
1162
- patterns: List[Tuple[str, str]] = [
1163
  # - x.00.pth, x.01.pth, etc.
1164
  (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
1165
  # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
@@ -1175,11 +1028,11 @@ def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
1175
  return None
1176
 
1177
 
1178
- def find_multifile_paths(path: Path) -> List[Path]:
1179
  '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
1180
  the whole list of paths in the model.
1181
  '''
1182
- ret: List[Path] = []
1183
  for i in itertools.count():
1184
  nth_path = nth_multifile_path(path, i)
1185
  if nth_path is None:
@@ -1203,11 +1056,6 @@ def load_some_model(path: Path) -> ModelPlus:
1203
  # Try the PyTorch patterns too, with lower priority
1204
  globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
1205
  files = [file for glob in globs for file in path.glob(glob)]
1206
- if not files:
1207
- # Try GGML too, but with lower priority, since if both a non-GGML
1208
- # model and a GGML model exist in the same directory, we assume the
1209
- # latter was converted from the former.
1210
- files = list(path.glob("ggml-model*.bin*"))
1211
  if not files:
1212
  raise Exception(f"Can't find model in directory {path}")
1213
  if len(files) > 1:
@@ -1215,7 +1063,7 @@ def load_some_model(path: Path) -> ModelPlus:
1215
  path = files[0]
1216
 
1217
  paths = find_multifile_paths(path)
1218
- models_plus: List[ModelPlus] = []
1219
  for path in paths:
1220
  print(f"Loading model file {path}")
1221
  models_plus.append(lazy_load_file(path))
@@ -1224,19 +1072,14 @@ def load_some_model(path: Path) -> ModelPlus:
1224
  return model_plus
1225
 
1226
 
1227
- def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
1228
- return {name: model[name] for name in TENSORS_LIST if name in model}
1229
-
1230
-
1231
- def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
1232
- print(f"vocabtype: {vocabtype}")
1233
  # Be extra-friendly and accept either a file or a directory. Also, if it's
1234
  # a directory, it might be the model directory, and tokenizer.model might
1235
  # be in the parent of that.
1236
  if path.is_dir():
1237
  vocab_file = "tokenizer.model"
1238
  if vocabtype == 'bpe':
1239
- vocab_file = "vocab.json"
1240
  path2 = path / vocab_file
1241
  # Use `.parent` instead of /.. to handle the symlink case better.
1242
  path3 = path.parent / vocab_file
@@ -1246,23 +1089,27 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
1246
  path = path3
1247
  else:
1248
  raise FileNotFoundError(
1249
- f"Could not find tokenizer.model in {path} or its parent; "
1250
  "if it's in another directory, pass the directory as --vocab-dir")
 
 
 
1251
  added_tokens_path = path.parent / "added_tokens.json"
1252
- print(f"Loading vocab file {path}")
1253
- return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
1254
- vocabtype)
 
 
 
1255
 
1256
 
1257
- def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
1258
  namestr = {
1259
- GGMLFileType.AllF32: "f32",
1260
  GGMLFileType.MostlyF16: "f16",
1261
- GGMLFileType.MostlyQ4_0: "q4_0",
1262
- GGMLFileType.MostlyQ4_1: "q4_1",
1263
- GGMLFileType.PerLayerIsQ4_1: "q4_1",
1264
  }[file_type]
1265
- ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
1266
  if ret in model_paths:
1267
  sys.stderr.write(
1268
  f"Error: Default output path ({ret}) would overwrite the input. "
@@ -1279,47 +1126,82 @@ def do_dump_model(model_plus: ModelPlus) -> None:
1279
  print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
1280
 
1281
 
1282
- def main(args_in: Optional[List[str]] = None) -> None:
1283
  parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
1284
- parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
1285
- parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
1286
- parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
1287
- parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
1288
- parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
1289
- parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
1290
- parser.add_argument("model", type=Path,
1291
- help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
1292
- parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
 
1293
  args = parser.parse_args(args_in)
1294
 
1295
- vocab: Vocab
1296
  if args.dump_single:
1297
  model_plus = lazy_load_file(args.model)
1298
  do_dump_model(model_plus)
1299
- elif args.vocab_only:
1300
- vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1301
  assert args.outfile, "need --outfile if using --vocab-only"
 
 
 
1302
  outfile = args.outfile
1303
- OutputFile.write_vocab_only(outfile, vocab)
1304
  print(f"Wrote {outfile}")
 
 
 
 
1305
  else:
1306
- model_plus = load_some_model(args.model)
1307
- if args.dump:
1308
- do_dump_model(model_plus)
1309
- return
1310
- if model_plus.vocab is not None and args.vocab_dir is None:
1311
- vocab = model_plus.vocab
1312
- else:
1313
- vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
1314
- vocab = load_vocab(vocab_dir, args.vocabtype)
1315
- params = Params.load(model_plus)
1316
- model = model_plus.model
1317
- model = do_necessary_conversions(model, params)
1318
- output_type = pick_output_type(model, args.outtype)
1319
- model = convert_to_output_type(model, output_type)
1320
- outfile = args.outfile or default_outfile(model_plus.paths, output_type)
1321
- OutputFile.write_all(outfile, params, output_type, model, vocab)
1322
- print(f"Wrote {outfile}")
1323
 
1324
 
1325
  if __name__ == '__main__':
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
  import argparse
5
  import concurrent.futures
6
  import copy
 
17
  import signal
18
  import struct
19
  import sys
20
+ import time
21
  import zipfile
22
  from abc import ABCMeta, abstractmethod
23
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
24
  from dataclasses import dataclass
25
  from pathlib import Path
26
+ from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar
 
27
 
28
  import numpy as np
29
+ from sentencepiece import SentencePieceProcessor # type: ignore[import]
30
+
31
+ import os
32
+ if 'NO_LOCAL_GGUF' not in os.environ:
33
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
34
+ import gguf
35
 
36
  if TYPE_CHECKING:
37
+ from typing import TypeAlias
38
 
39
  if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
40
  faulthandler.register(signal.SIGUSR1)
41
 
42
+ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
43
 
44
+ ARCH=gguf.MODEL_ARCH.LLAMA
45
+ NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
46
+
47
+ DEFAULT_CONCURRENCY = 8
48
+ #
49
+ # data types
50
+ #
51
 
52
  @dataclass(frozen=True)
53
+ class DataType:
54
  name: str
55
+ dtype: np.dtype[Any]
56
+ valid_conversions: list[str]
57
 
58
+ def elements_to_bytes(self, n_elements: int) -> int:
59
+ return n_elements * self.dtype.itemsize
 
 
 
 
60
 
61
  @dataclass(frozen=True)
62
+ class UnquantizedDataType(DataType):
63
+ pass
 
 
 
64
 
65
+ DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
66
+ DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
67
+ DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
68
+ DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
69
 
70
+ @dataclass(frozen=True)
71
+ class QuantizedDataType(DataType):
72
+ block_size: int
73
+ quantized_dtype: np.dtype[Any]
74
+ ggml_type: gguf.GGMLQuantizationType
75
 
76
+ def quantize(self, arr: NDArray) -> NDArray:
77
+ raise NotImplementedError(f'Quantization for {self.name} not implemented')
 
 
 
 
78
 
79
+ def elements_to_bytes(self, n_elements: int) -> int:
80
+ assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
81
+ return self.quantized_dtype.itemsize * (n_elements // self.block_size)
82
 
83
+ @dataclass(frozen=True)
84
+ class Q8_0QuantizedDataType(QuantizedDataType):
85
+ # Mini Q8_0 quantization in Python!
86
+ def quantize(self, arr: NDArray) -> NDArray:
87
+ assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
88
+ assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
89
+ n_blocks = arr.size // self.block_size
90
+ blocks = arr.reshape((n_blocks, self.block_size))
91
+ # Much faster implementation of block quantization contributed by @Cebtenzzre
92
+ def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
93
+ d = abs(blocks).max(axis = 1) / np.float32(127)
94
+ with np.errstate(divide = 'ignore'):
95
+ qs = (blocks / d[:, None]).round()
96
+ qs[d == 0] = 0
97
+ yield from zip(d, qs)
98
+ return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
99
+
100
+ DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
101
+ dtype = np.dtype(np.float32), valid_conversions = [],
102
+ ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
103
+ quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
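A minimal standalone sketch (not part of the commit; sample values invented) of the arithmetic the Q8_0 quantizer above performs: each block of 32 float32 weights is reduced to one float16 scale d plus 32 int8 quants, so reconstruction error stays within one quantization step.

    import numpy as np

    block = np.linspace(-1.0, 1.0, 32, dtype=np.float32)   # one invented block of 32 weights
    d = np.abs(block).max() / np.float32(127)               # per-block scale, stored as float16 in Q8_0
    qs = np.round(block / d).astype(np.int8)                # quantized values in [-127, 127]
    restored = qs.astype(np.float32) * d
    assert float(np.max(np.abs(restored - block))) <= float(d)   # error bounded by one quantization step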
104
+
105
+ # Quantized types skipped here because they may also map to np.float32
106
+ NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
107
+ for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
108
+ if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
109
+ raise ValueError(f'Invalid duplicate data type {dt}')
110
+ NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
111
+
112
+ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
113
+ 'BF16': DT_BF16,
114
+ 'F16': DT_F16,
115
+ 'F32': DT_F32,
116
+ 'I32': DT_I32,
117
  }
118
 
119
+ # TODO: match this with `llama_ftype`
120
+ # TODO: rename to LLAMAFileType
121
+ # TODO: move to `gguf.py`
122
+ class GGMLFileType(enum.IntEnum):
123
+ AllF32 = 0
124
+ MostlyF16 = 1 # except 1d tensors
125
+ MostlyQ8_0 = 7 # except 1d tensors
126
+
127
+ def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
128
+ dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
129
+ if dt is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  raise ValueError(self)
131
+ # 1D tensors are always F32.
132
+ return dt if len(tensor.shape) > 1 else DT_F32
133
 
134
+ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
135
+ GGMLFileType.AllF32 : DT_F32,
136
+ GGMLFileType.MostlyF16 : DT_F16,
137
+ GGMLFileType.MostlyQ8_0: DT_Q8_0,
138
+ }
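An illustration of how type_for_tensor and the mapping above combine (a sketch that assumes the surrounding module is fully loaded; the tensor names and shapes are invented): 2-D weights follow the chosen file type, while 1-D tensors such as norms always stay F32.

    weight = LazyTensor(lambda: UnquantizedTensor(np.zeros((16, 32), dtype=np.float32)), [16, 32], DT_F32, 'invented weight')
    norm   = LazyTensor(lambda: UnquantizedTensor(np.zeros((32,), dtype=np.float32)), [32], DT_F32, 'invented norm')
    assert GGMLFileType.MostlyQ8_0.type_for_tensor('demo.weight', weight) is DT_Q8_0   # 2-D tensor -> quantized
    assert GGMLFileType.MostlyQ8_0.type_for_tensor('demo.norm', norm) is DT_F32        # 1-D tensor -> F32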
139
 
140
+ #
141
+ # hparams loading
142
+ #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
+ @dataclass
145
+ class Params:
146
+ n_vocab: int
147
+ n_embd: int
148
+ n_layer: int
149
+ n_ctx: int
150
+ n_ff: int
151
+ n_head: int
152
+ n_head_kv: int
153
+ f_norm_eps: float
154
 
155
+ f_rope_freq_base: float | None = None
156
+ f_rope_scale: float | None = None
157
 
158
+ ftype: GGMLFileType | None = None
159
 
160
+ # path to the directory containing the model files
161
+ path_model: Path | None = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
  @staticmethod
164
+ def guessed(model: LazyModel) -> Params:
165
  # try transformer naming first
166
  n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
167
 
 
177
  raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
178
  "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
179
 
180
+ n_head = n_embd // 128 # guessed
181
+ n_mult = 256 # guessed
182
+
183
+ # TODO: verify this
184
+ n_ff = int(2 * (4 * n_embd) / 3)
185
+ n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
186
 
187
  return Params(
188
+ n_vocab = n_vocab,
189
+ n_embd = n_embd,
190
+ n_layer = n_layer,
191
+ n_ctx = -1,
192
+ n_ff = n_ff,
193
+ n_head = n_head,
194
+ n_head_kv = n_head,
195
+ f_norm_eps = 1e-5,
196
  )
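A worked instance of the guess above, using an assumed n_embd of 4096 (LLaMA-7B-like; not a value taken from this diff): n_head = 4096 // 128 = 32, the raw feed-forward estimate is int(2 * (4 * 4096) / 3) = 10922, and rounding up to the next multiple of n_mult = 256 gives 11008.

    n_embd, n_mult = 4096, 256                       # assumed 7B-style width
    n_head = n_embd // 128                           # 32
    n_ff = int(2 * (4 * n_embd) / 3)                 # 10922
    n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)  # round up to a multiple of 256
    assert (n_head, n_ff) == (32, 11008)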
197
 
198
  @staticmethod
199
+ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
200
  config = json.load(open(config_path))
201
 
202
+ n_vocab = config["vocab_size"]
203
+ n_embd = config["hidden_size"]
204
+ n_layer = config["num_hidden_layers"]
205
+ n_ff = config["intermediate_size"]
206
+ n_head = config["num_attention_heads"]
207
+ n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
208
+ f_norm_eps = config["rms_norm_eps"]
209
+ f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
210
+
211
+ rope_scaling = config.get("rope_scaling")
212
+ if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
213
+ f_rope_scale = config["rope_scaling"].get("factor")
214
+ else:
215
+ f_rope_scale = None
216
 
217
+ if "max_sequence_length" in config:
218
+ n_ctx = config["max_sequence_length"]
219
+ elif "max_position_embeddings" in config:
220
+ n_ctx = config["max_position_embeddings"]
221
+ else:
222
+ raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
223
+ "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
224
 
225
  return Params(
226
+ n_vocab = n_vocab,
227
+ n_embd = n_embd,
228
+ n_layer = n_layer,
229
+ n_ctx = n_ctx,
230
+ n_ff = n_ff,
231
+ n_head = n_head,
232
+ n_head_kv = n_head_kv,
233
+ f_norm_eps = f_norm_eps,
234
+ f_rope_freq_base = f_rope_freq_base,
235
+ f_rope_scale = f_rope_scale,
236
  )
237
 
238
  # LLaMA v2 70B params.json
239
+ # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
240
  @staticmethod
241
+ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
242
  config = json.load(open(config_path))
243
 
244
+ n_vocab = config["vocab_size"] if "vocab_size" in config else -1
245
+ n_embd = config["dim"]
246
+ n_layer = config["n_layers"]
247
+ n_ff = -1
248
+ n_head = config["n_heads"]
249
+ n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
250
+ f_norm_eps = config["norm_eps"]
251
+ f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
252
+
253
+ # hack to determine LLaMA v1 vs v2 vs CodeLlama
254
+ if f_rope_freq_base == 1000000:
255
+ # CodeLlama
256
+ n_ctx = 16384
257
+ elif config["norm_eps"] == 1e-05:
258
+ # LLaMA v2
259
+ n_ctx = 4096
260
+ else:
261
+ # LLaMA v1
262
+ n_ctx = 2048
263
 
264
  if n_vocab == -1:
265
  n_vocab = model["tok_embeddings.weight"].shape[0]
266
 
267
+ if n_ff == -1:
268
+ n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
269
+
270
  return Params(
271
+ n_vocab = n_vocab,
272
+ n_embd = n_embd,
273
+ n_layer = n_layer,
274
+ n_ctx = n_ctx,
275
+ n_ff = n_ff,
276
+ n_head = n_head,
277
+ n_head_kv = n_head_kv,
278
+ f_norm_eps = f_norm_eps,
279
+ f_rope_freq_base = f_rope_freq_base,
280
  )
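A condensed restatement of the context-length heuristic above; the cutoff values are the ones hard-coded in this diff, while the sample configs are invented.

    def guess_n_ctx(config: dict) -> int:
        # rope_theta == 1e6 -> CodeLlama; norm_eps == 1e-5 -> LLaMA v2; otherwise LLaMA v1
        if config.get("rope_theta") == 1000000:
            return 16384
        if config["norm_eps"] == 1e-05:
            return 4096
        return 2048

    assert guess_n_ctx({"norm_eps": 1e-05, "rope_theta": 1000000}) == 16384  # invented CodeLlama-style params.json
    assert guess_n_ctx({"norm_eps": 1e-05}) == 4096                          # invented LLaMA v2-style params.json
    assert guess_n_ctx({"norm_eps": 1e-06}) == 2048                          # invented LLaMA v1-style params.json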
281
 
282
  @staticmethod
283
+ def load(model_plus: ModelPlus) -> Params:
284
  hf_config_path = model_plus.paths[0].parent / "config.json"
285
  orig_config_path = model_plus.paths[0].parent / "params.json"
286
 
 
288
  params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
289
  elif orig_config_path.exists():
290
  params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
291
+ elif model_plus.format != 'none':
292
  params = Params.guessed(model_plus.model)
293
+ else:
294
+ raise ValueError('Cannot guess params when model format is none')
295
+
296
+ params.path_model = model_plus.paths[0].parent
297
 
 
298
  return params
299
 
300
 
301
+ #
302
+ # vocab
303
+ #
304
+
305
+ class BpeVocab:
306
+ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
307
+ self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
308
+ added_tokens: dict[str, int]
309
+ if fname_added_tokens is not None:
310
+ # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
311
+ added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
312
  else:
313
+ # Fall back to trying to find the added tokens in tokenizer.json
314
+ tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
315
+ if not tokenizer_json_file.is_file():
316
+ added_tokens = {}
317
+ else:
318
+ tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
319
+ added_tokens = dict(
320
+ (item['content'], item['id'])
321
+ for item in tokenizer_json.get('added_tokens', [])
322
+ # Added tokens here can be duplicates of the main vocabulary.
323
+ if item['content'] not in self.bpe_tokenizer )
324
+
325
+ vocab_size: int = len(self.bpe_tokenizer)
326
+ expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
327
+ actual_ids = sorted(added_tokens.values())
328
+ if expected_ids != actual_ids:
329
+ expected_end_id = vocab_size + len(actual_ids) - 1
330
+ raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
331
+
332
+ items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
333
+ self.added_tokens_list = [text for (text, idx) in items]
334
+ self.vocab_size_base: int = vocab_size
335
+ self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
336
+ self.fname_tokenizer = fname_tokenizer
337
+ self.fname_added_tokens = fname_added_tokens
338
+
339
+ def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
340
+ tokenizer = self.bpe_tokenizer
341
+ from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
342
+ byte_encoder = tokenization_gpt2.bytes_to_unicode()
343
+ byte_decoder = {v: k for k, v in byte_encoder.items()}
344
+ score = 0.0
345
+ for i, item in enumerate(tokenizer):
346
+ text: bytes = item.encode("utf-8")
347
+ # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
348
+ if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
349
+ if i == 0 and text == b'<unk>':
350
+ toktype = gguf.TokenType.UNKNOWN
351
+ elif i == 1 or i == 2:
352
+ toktype = gguf.TokenType.CONTROL
353
+ elif i >= 3 and text.startswith(b'<0x'):
354
+ toktype = gguf.TokenType.BYTE
355
+ else:
356
+ toktype = gguf.TokenType.NORMAL
357
+ else:
358
+ toktype = gguf.TokenType.NORMAL
359
+ yield text, score, toktype
360
+
361
+ def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
362
+ for text in self.added_tokens_list:
363
+ score = -1000.0
364
+ yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
365
+
366
+ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
367
+ yield from self.bpe_tokens()
368
+ yield from self.added_tokens()
369
+
370
+ def __repr__(self) -> str:
371
+ return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
372
+
373
+
374
+ class SentencePieceVocab:
375
+ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
376
+ self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
377
+ added_tokens: dict[str, int]
378
  if fname_added_tokens is not None:
379
+ added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
380
  else:
381
  added_tokens = {}
382
+
383
+ vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
 
 
384
  expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
385
+ actual_ids = sorted(added_tokens.values())
386
  if expected_ids != actual_ids:
387
  raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
388
+
389
  items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
390
  self.added_tokens_list = [text for (text, idx) in items]
391
  self.vocab_size_base: int = vocab_size
 
393
  self.fname_tokenizer = fname_tokenizer
394
  self.fname_added_tokens = fname_added_tokens
395
 
396
+ def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
397
  tokenizer = self.sentencepiece_tokenizer
398
+ for i in range(tokenizer.vocab_size()):
399
+ piece = tokenizer.id_to_piece(i)
400
+ text: bytes = piece.encode("utf-8")
401
+ score: float = tokenizer.get_score(i)
402
+
403
+ toktype = gguf.TokenType.NORMAL
404
+ if tokenizer.is_unknown(i):
405
+ toktype = gguf.TokenType.UNKNOWN
406
+ if tokenizer.is_control(i):
407
+ toktype = gguf.TokenType.CONTROL
408
+
409
+ # NOTE: I think added_tokens are user defined.
410
+ # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
411
+ # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
412
+
413
+ if tokenizer.is_unused(i):
414
+ toktype = gguf.TokenType.UNUSED
415
+ if tokenizer.is_byte(i):
416
+ toktype = gguf.TokenType.BYTE
417
+
418
+ yield text, score, toktype
419
+
420
+ def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
 
 
 
 
 
421
  for text in self.added_tokens_list:
422
  score = -1000.0
423
+ yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
424
 
425
+ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
426
  yield from self.sentencepiece_tokens()
427
  yield from self.added_tokens()
428
 
429
  def __repr__(self) -> str:
430
  return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
431
 
432
+ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
433
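For readers following the new vocab handling: both classes expose the same iteration protocol, so downstream code only needs `all_tokens()`, which yields the base vocabulary first and any `added_tokens.json` entries last. A minimal usage sketch, assuming the classes above are importable and `sentencepiece` is installed; the tokenizer path is a placeholder and `None` means there is no added-tokens file:

```python
# Minimal sketch (placeholder path, no added_tokens.json).
from pathlib import Path

vocab = SentencePieceVocab(Path("models/7B/tokenizer.model"), None)
print(vocab)  # e.g. <SentencePieceVocab with 32000 base tokens and 0 added tokens>
for text, score, toktype in list(vocab.all_tokens())[:5]:
    print(text, score, toktype)  # raw UTF-8 bytes, SentencePiece score, gguf.TokenType
```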
 
434
+ #
435
+ # data loading
436
+ # TODO: reuse (probably move to gguf.py?)
437
+ #
438
 
439
+ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
440
+ #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
441
+ if n_head_kv is not None and n_head != n_head_kv:
442
+ n_head //= n_head_kv
 
 
 
 
 
 
 
 
 
443
  return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
444
  .swapaxes(1, 2)
445
  .reshape(weights.shape))
446
 
447
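The `permute()` helper above undoes the interleaved head layout used for the Q/K projection weights in Hugging Face checkpoints. Below is a self-contained NumPy sketch of the same reshape/swapaxes/reshape sequence; the shapes are toy values chosen only for illustration:

```python
import numpy as np

# Toy projection matrix: 4 heads, head_dim 4, hidden size 8 -> shape (16, 8).
n_head = 4
weights = np.arange(16 * 8, dtype=np.float32).reshape(16, 8)

# Same operation as permute() when n_head == n_head_kv:
permuted = (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

assert permuted.shape == weights.shape  # only the row ordering within each head changes
```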
448
  class Tensor(metaclass=ABCMeta):
449
  data_type: DataType
450
 
451
  @abstractmethod
452
+ def astype(self, data_type: DataType) -> Tensor: ...
453
  @abstractmethod
454
+ def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
455
  @abstractmethod
456
+ def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
457
  @abstractmethod
458
+ def part(self, n_part: int) -> UnquantizedTensor: ...
459
  @abstractmethod
460
+ def to_ggml(self) -> GGMLCompatibleTensor: ...
461
 
462
 
463
+ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
464
  assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
465
  fp32_arr = bf16_arr.astype(np.uint32) << 16
466
  return fp32_arr.view(np.float32)
 
473
  self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
474
 
475
  def astype(self, data_type: DataType) -> Tensor:
476
+ dtype = data_type.dtype
477
  if self.data_type == DT_BF16:
478
  self.ndarray = bf16_to_fp32(self.ndarray)
479
  return UnquantizedTensor(self.ndarray.astype(dtype))
480
 
481
+ def to_ggml(self) -> UnquantizedTensor:
482
  return self
483
 
484
+ def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
485
  r = self.ndarray.shape[0] // 3
486
+ return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
487
 
488
+ def part(self, n_part: int) -> UnquantizedTensor:
489
  r = self.ndarray.shape[0] // 3
490
  return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
491
 
492
+ def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
493
+ return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
494
 
495
 
496
+ def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
497
  tensor = lazy_tensor.load()
498
  assert isinstance(tensor, UnquantizedTensor)
499
 
 
509
  return tensor.ndarray
510
 
511
 
512
+ GGMLCompatibleTensor = UnquantizedTensor
513
 
514
 
515
  @dataclass
516
  class LazyTensor:
517
  _load: Callable[[], Tensor]
518
+ shape: list[int]
519
  data_type: DataType
520
  description: str
521
 
522
  def load(self) -> Tensor:
523
  ret = self._load()
524
+ # Should be okay if it maps to the same numpy type?
525
+ assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
526
+ (self.data_type, ret.data_type, self.description)
527
  return ret
528
 
529
+ def astype(self, data_type: DataType) -> LazyTensor:
530
  self.validate_conversion_to(data_type)
531
 
532
  def load() -> Tensor:
 
534
  return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
535
 
536
  def validate_conversion_to(self, data_type: DataType) -> None:
537
+ if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
538
+ raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
 
 
 
 
 
 
 
 
 
 
 
539
 
540
 
541
+ LazyModel: TypeAlias = 'dict[str, LazyTensor]'
542
 
543
 
544
  @dataclass
545
  class ModelPlus:
546
  model: LazyModel
547
+ paths: list[Path] # Where this was read from.
548
+ format: Literal['ggml', 'torch', 'safetensors', 'none']
549
+ vocab: Vocab | None # For GGML models (which have vocab built in), the vocab.
550
 
551
 
552
+ def merge_sharded(models: list[LazyModel]) -> LazyModel:
553
  # Original LLaMA models have each file contain one part of each tensor.
554
  # Use a dict instead of a set to preserve order.
555
  names = {name: None for model in models for name in model}
556
 
557
  def convert(name: str) -> LazyTensor:
558
+ lazy_tensors: list[LazyTensor] = [model[name] for model in models]
559
  if len(lazy_tensors) == 1:
560
  # only one file; don't go through this procedure since there might
561
  # be quantized tensors
 
583
  return {name: convert(name) for name in names}
584
 
585
 
586
+ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
587
  formats = set(mp.format for mp in models_plus)
588
  assert len(formats) == 1, "different formats?"
589
  format = formats.pop()
 
606
  return ModelPlus(model, paths, format, vocab)
607
 
608
 
609
+ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
610
  def load() -> Tensor:
611
+ return lazy_tensor.load().permute(n_head, n_head_kv)
612
+ return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
613
 
614
+ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
615
  def load() -> Tensor:
616
+ return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
617
  s = lazy_tensor.shape.copy()
618
  s[0] = s[0] // 3
619
+ return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
620
 
621
  def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
622
  def load() -> Tensor:
 
625
  s[0] = s[0] // 3
626
  return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
627
628
 
629
  # Functionality that simulates `torch.load` but where individual tensors are
630
  # only loaded into memory on demand, not all at once.
 
657
  assert isinstance(pid[1], LazyStorageKind)
658
  data_type = pid[1].data_type
659
  filename_stem = pid[2]
660
+ filename = f'{self.data_base_path}/{filename_stem}'
661
  info = self.zip_file.getinfo(filename)
662
 
663
  def load(offset: int, elm_count: int) -> NDArray:
664
+ dtype = data_type.dtype
 
 
665
  fp = self.zip_file.open(info)
666
  fp.seek(offset * dtype.itemsize)
667
  size = elm_count * dtype.itemsize
 
671
  description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
672
  return LazyStorage(load=load, kind=pid[1], description=description)
673
 
674
+ @staticmethod
675
  def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
 
676
  requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
677
  assert isinstance(storage, LazyStorage)
678
 
 
682
  description = f'pickled storage_offset={storage_offset} in {storage.description}'
683
  return LazyTensor(load, list(size), storage.kind.data_type, description)
684
 
685
+ @staticmethod
686
  def rebuild_from_type_v2(func, new_type, args, state):
687
  return func(*args)
688
 
689
+ CLASSES: dict[tuple[str, str], Any] = {
690
+ # getattr used here as a workaround for mypy not being smart enough to determine
691
+ # the staticmethods have a __func__ attribute.
692
+ ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
693
+ ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
694
  ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
695
  ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
696
  ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
 
717
  return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
718
 
719
 
 
 
 
 
 
 
 
 
720
  def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
721
  header_size, = struct.unpack('<Q', fp.read(8))
722
+ header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size))
723
  # Use mmap for the actual data to avoid race conditions with the file offset.
724
  mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
725
  byte_buf = mapped[8 + header_size:]
726
 
727
+ def convert(info: dict[str, Any]) -> LazyTensor:
728
  data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
729
+ numpy_dtype = data_type.dtype
730
+ shape: list[int] = info['shape']
731
  begin, end = info['data_offsets']
732
  assert 0 <= begin <= end <= len(byte_buf)
733
  assert end - begin == math.prod(shape) * numpy_dtype.itemsize
 
748
  return ret
749
 
750
751
  @functools.lru_cache(maxsize=None)
752
  def lazy_load_file(path: Path) -> ModelPlus:
753
  fp = open(path, 'rb')
 
756
  if first8[:2] == b'PK':
757
  # A zip file, i.e. PyTorch format
758
  return lazy_load_torch_file(fp, path)
 
 
 
759
  elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
760
  # Probably safetensors
761
  return lazy_load_safetensors_file(fp, path)
 
766
  In = TypeVar('In')
767
  Out = TypeVar('Out')
768
 
769
+ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
 
770
  '''Parallel map, but with backpressure. If the caller doesn't call `next`
771
  fast enough, this will stop calling `func` at some point rather than
772
  letting results pile up in memory. Specifically, there is a max of one
773
  output value buffered per thread.'''
774
+ if concurrency < 2:
775
+ yield from map(func, iterable)
776
+ # Not reached.
777
+ iterable = iter(iterable)
778
+ executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
779
+ if use_processpool_executor:
780
+ executor_class = ProcessPoolExecutor
781
+ else:
782
+ executor_class = ThreadPoolExecutor
783
+ with executor_class(max_workers = max_workers) as executor:
784
+ futures: list[concurrent.futures.Future[Out]] = []
785
+ done = False
786
+ for _ in range(concurrency):
787
+ try:
788
+ futures.append(executor.submit(func, next(iterable)))
789
+ except StopIteration:
790
+ done = True
791
+ break
792
+
793
  while futures:
794
  result = futures.pop(0).result()
795
+ while not done and len(futures) < concurrency:
796
+ try:
797
+ futures.append(executor.submit(func, next(iterable)))
798
+ except StopIteration:
799
+ done = True
800
+ break
801
  yield result
802
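To see the backpressure described in the docstring of `bounded_parallel_map()` above: at most one result per worker slot is buffered ahead of the consumer, so a slow consumer keeps memory bounded instead of letting every produced tensor pile up. A toy sketch follows; the sleeps and numbers are illustrative and not part of the converter:

```python
import time

def slow_square(x: int) -> int:
    time.sleep(0.01)  # stand-in for loading/quantizing one tensor
    return x * x

# Only ~4 results are ever buffered, even though the consumer is slower than the workers.
for y in bounded_parallel_map(slow_square, range(100), concurrency=4):
    time.sleep(0.05)  # slow consumer; new work is only submitted as results are consumed
    print(y)
```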
 
 
803
  def check_vocab_size(params: Params, vocab: Vocab) -> None:
804
  if params.n_vocab != vocab.vocab_size:
805
+ assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
 
806
  if params.n_vocab == vocab.vocab_size_base:
807
  print("Ignoring added_tokens.json since model matches vocab size without it.")
808
  vocab.added_tokens_list = []
 
819
 
820
  class OutputFile:
821
  def __init__(self, fname_out: Path) -> None:
822
+ self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
823
+
824
+ def add_meta_arch(self, params: Params) -> None:
825
+ name = "LLaMA"
826
+
827
+ # TODO: better logic to determine model name
828
+ if params.n_ctx == 4096:
829
+ name = "LLaMA v2"
830
+ elif params.path_model is not None:
831
+ name = str(params.path_model.parent).split('/')[-1]
832
+
833
+ self.gguf.add_name (name)
834
+ self.gguf.add_context_length (params.n_ctx)
835
+ self.gguf.add_embedding_length (params.n_embd)
836
+ self.gguf.add_block_count (params.n_layer)
837
+ self.gguf.add_feed_forward_length (params.n_ff)
838
+ self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
839
+ self.gguf.add_head_count (params.n_head)
840
+ self.gguf.add_head_count_kv (params.n_head_kv)
841
+ self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
842
+
843
+ if params.f_rope_freq_base is not None:
844
+ self.gguf.add_rope_freq_base(params.f_rope_freq_base)
845
+
846
+ if params.f_rope_scale is not None:
847
+ self.gguf.add_rope_scale_linear(params.f_rope_scale)
848
+
849
+ if params.ftype is not None:
850
+ self.gguf.add_file_type(params.ftype)
851
+
852
+ def add_meta_vocab(self, vocab: Vocab) -> None:
853
+ tokens = []
854
+ scores = []
855
+ toktypes = []
856
+ # NOTE: `all_tokens` returns the base vocabulary and added tokens
857
+ for text, score, toktype in vocab.all_tokens():
858
+ tokens.append(text)
859
+ scores.append(score)
860
+ toktypes.append(toktype)
861
+
862
+ if isinstance(vocab, SentencePieceVocab):
863
+ self.gguf.add_tokenizer_model("llama")
864
+ elif isinstance(vocab, BpeVocab):
865
+ self.gguf.add_tokenizer_model("gpt2")
866
+ else:
867
+ raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
868
+ self.gguf.add_token_list(tokens)
869
+ self.gguf.add_token_scores(scores)
870
+ self.gguf.add_token_types(toktypes)
871
+
872
+ def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
873
+ svocab.add_to_gguf(self.gguf)
874
+
875
+ def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
876
+ n_elements = int(np.prod(tensor.shape))
877
+ raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
878
+ data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
879
+ data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
880
+ self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype)
881
+
882
+ def write_meta(self) -> None:
883
+ self.gguf.write_header_to_file()
884
+ self.gguf.write_kv_data_to_file()
885
+
886
+ def write_tensor_info(self) -> None:
887
+ self.gguf.write_ti_data_to_file()
888
+
889
+ def close(self) -> None:
890
+ self.gguf.close()
891
 
892
  @staticmethod
893
+ def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
894
+ check_vocab_size(params, vocab)
895
+
896
  of = OutputFile(fname_out)
897
+
898
+ # meta data
899
+ of.add_meta_arch(params)
900
+ of.add_meta_vocab(vocab)
901
+ of.add_meta_special_vocab(svocab)
902
+
903
+ of.write_meta()
904
+
905
+ of.close()
906
+
907
+ @staticmethod
908
+ def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]:
909
+ name, lazy_tensor = item
910
+ tensor = lazy_tensor.load().to_ggml()
911
+ return (lazy_tensor.data_type, tensor.ndarray)
912
 
913
  @staticmethod
914
+ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
915
+ dt, arr = item
916
+ if not isinstance(dt, QuantizedDataType):
917
+ return arr
918
+ return dt.quantize(arr)
919
+
920
+ @staticmethod
921
+ def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
922
  check_vocab_size(params, vocab)
923
+
924
  of = OutputFile(fname_out)
 
 
 
925
 
926
+ # meta data
927
+ of.add_meta_arch(params)
928
+ of.add_meta_vocab(vocab)
929
+ of.add_meta_special_vocab(svocab)
930
+
931
+ # tensor info
932
+ for name, lazy_tensor in model.items():
933
+ of.add_tensor_info(name, lazy_tensor)
934
+
935
+ of.write_meta()
936
+ of.write_tensor_info()
937
+
938
+ # tensor data
939
+ ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
940
+ if ftype == GGMLFileType.MostlyQ8_0:
941
+ ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True)
942
+ else:
943
+ ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
944
 
945
+ start = time.time()
946
  for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
947
+ elapsed = time.time() - start
948
  size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
949
  padi = len(str(len(model)))
950
+ print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
951
+ of.gguf.write_tensor_data(ndarray)
 
 
952
 
953
+ of.close()
954
 
955
+ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
956
+ wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
957
+
958
+ if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
959
  return GGMLFileType.AllF32
960
+ if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
961
  return GGMLFileType.MostlyF16
962
+ if output_type_str == "q8_0":
963
+ return GGMLFileType.MostlyQ8_0
964
+
 
 
 
 
 
965
  name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
966
+
967
  raise Exception(f"Unexpected combination of types: {name_to_type}")
968
 
969
+ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
970
+ return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
971
+ for (name, tensor) in model.items()}
972
 
973
+ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
974
+ tmap = gguf.TensorNameMap(ARCH, params.n_layer)
975
+ should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
976
 
977
+ tmp = model
 
 
978
 
979
+ # HF models permute or pack some of the tensors, so we need to undo that
980
+ for i in itertools.count():
981
+ if f"model.layers.{i}.self_attn.q_proj.weight" in model:
982
+ print(f"Permuting layer {i}")
983
+ tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
984
+ tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
985
+ #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
986
+ elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
987
+ print(f"Unpacking and permuting layer {i}")
988
+ tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
989
+ tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
990
+ tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
991
+ del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
992
+ else:
993
+ break
994
 
995
+ out: LazyModel = {}
996
+ for name, lazy_tensor in model.items():
997
+ tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
998
+ if name_new is None:
999
+ raise Exception(f"Unexpected tensor name: {name}")
1000
 
1001
+ if tensor_type in should_skip:
1002
+ print(f"skipping tensor {name_new}")
1003
+ continue
1004
 
1005
+ print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
1006
+ out[name_new] = lazy_tensor
1007
 
1008
+ return out
1009
+
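The `part_lazy`/`permute_part_lazy` calls in `convert_model_names()` above slice a fused `W_pack` attention matrix into equal Q/K/V thirds before permuting Q and K. Here is the slicing step in isolation, as a NumPy sketch with toy shapes:

```python
import numpy as np

hidden = 8
W_pack = np.arange(3 * hidden * hidden, dtype=np.float32).reshape(3 * hidden, hidden)

r = W_pack.shape[0] // 3  # same split as part()/permute_part() above
q_proj = W_pack[0 * r : 1 * r, ...]
k_proj = W_pack[1 * r : 2 * r, ...]
v_proj = W_pack[2 * r : 3 * r, ...]

assert q_proj.shape == k_proj.shape == v_proj.shape == (hidden, hidden)
```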
1010
+ def nth_multifile_path(path: Path, n: int) -> Path | None:
1011
  '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
1012
  the nth path in the model.
1013
  '''
1014
  # Support the following patterns:
1015
+ patterns: list[tuple[str, str]] = [
1016
  # - x.00.pth, x.01.pth, etc.
1017
  (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
1018
  # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
 
1028
  return None
1029
 
1030
 
1031
+ def find_multifile_paths(path: Path) -> list[Path]:
1032
  '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
1033
  the whole list of paths in the model.
1034
  '''
1035
+ ret: list[Path] = []
1036
  for i in itertools.count():
1037
  nth_path = nth_multifile_path(path, i)
1038
  if nth_path is None:
 
1056
  # Try the PyTorch patterns too, with lower priority
1057
  globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
1058
  files = [file for glob in globs for file in path.glob(glob)]
 
 
 
 
 
1059
  if not files:
1060
  raise Exception(f"Can't find model in directory {path}")
1061
  if len(files) > 1:
 
1063
  path = files[0]
1064
 
1065
  paths = find_multifile_paths(path)
1066
+ models_plus: list[ModelPlus] = []
1067
  for path in paths:
1068
  print(f"Loading model file {path}")
1069
  models_plus.append(lazy_load_file(path))
 
1072
  return model_plus
1073
 
1074
 
1075
+ def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
 
 
 
 
 
1076
  # Be extra-friendly and accept either a file or a directory. Also, if it's
1077
  # a directory, it might be the model directory, and tokenizer.model might
1078
  # be in the parent of that.
1079
  if path.is_dir():
1080
  vocab_file = "tokenizer.model"
1081
  if vocabtype == 'bpe':
1082
+ vocab_file = "vocab.json"
1083
  path2 = path / vocab_file
1084
  # Use `.parent` instead of /.. to handle the symlink case better.
1085
  path3 = path.parent / vocab_file
 
1089
  path = path3
1090
  else:
1091
  raise FileNotFoundError(
1092
+ f"Could not find {vocab_file} in {path} or its parent; "
1093
  "if it's in another directory, pass the directory as --vocab-dir")
1094
+
1095
+ print(f"Loading vocab file '{path}', type '{vocabtype}'")
1096
+
1097
  added_tokens_path = path.parent / "added_tokens.json"
1098
+ if vocabtype == "bpe":
1099
+ return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
1100
+ elif vocabtype == "spm":
1101
+ return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
1102
+ else:
1103
+ raise ValueError(f"Unsupported vocabulary type {vocabtype}")
1104
 
1105
 
1106
+ def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
1107
  namestr = {
1108
+ GGMLFileType.AllF32: "f32",
1109
  GGMLFileType.MostlyF16: "f16",
1110
+ GGMLFileType.MostlyQ8_0:"q8_0",
 
 
1111
  }[file_type]
1112
+ ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
1113
  if ret in model_paths:
1114
  sys.stderr.write(
1115
  f"Error: Default output path ({ret}) would overwrite the input. "
 
1126
  print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
1127
 
1128
 
1129
+ def main(args_in: list[str] | None = None) -> None:
1130
  parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
1131
+ parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
1132
+ parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
1133
+ parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
1134
+ parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
1135
+ parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
1136
+ parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
1137
+ parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
1138
+ parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
1139
+ parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
1140
+ parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
1141
  args = parser.parse_args(args_in)
1142
 
 
1143
  if args.dump_single:
1144
  model_plus = lazy_load_file(args.model)
1145
  do_dump_model(model_plus)
1146
+ return
1147
+
1148
+ if not args.vocab_only:
1149
+ model_plus = load_some_model(args.model)
1150
+ else:
1151
+ model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
1152
+
1153
+ if args.dump:
1154
+ do_dump_model(model_plus)
1155
+ return
1156
+
1157
+ params = Params.load(model_plus)
1158
+ if params.n_ctx == -1:
1159
+ if args.ctx is None:
1160
+ raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
1161
+ "Please specify one with --ctx:\n"
1162
+ " - LLaMA v1: --ctx 2048\n"
1163
+ " - LLaMA v2: --ctx 4096\n")
1164
+ params.n_ctx = args.ctx
1165
+
1166
+ if args.outtype:
1167
+ params.ftype = {
1168
+ "f32": GGMLFileType.AllF32,
1169
+ "f16": GGMLFileType.MostlyF16,
1170
+ "q8_0": GGMLFileType.MostlyQ8_0,
1171
+ }[args.outtype]
1172
+
1173
+ print(f"params = {params}")
1174
+
1175
+ vocab: Vocab
1176
+ if args.vocab_only:
1177
  assert args.outfile, "need --outfile if using --vocab-only"
1178
+ # FIXME: Try to respect vocab_dir somehow?
1179
+ vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
1180
+ special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
1181
  outfile = args.outfile
1182
+ OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
1183
  print(f"Wrote {outfile}")
1184
+ return
1185
+
1186
+ if model_plus.vocab is not None and args.vocab_dir is None:
1187
+ vocab = model_plus.vocab
1188
  else:
1189
+ vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
1190
+ vocab = load_vocab(vocab_dir, args.vocabtype)
1191
+ # FIXME: Try to respect vocab_dir somehow?
1192
+ special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
1193
+
1194
+ model = model_plus.model
1195
+ model = convert_model_names(model, params)
1196
+ ftype = pick_output_type(model, args.outtype)
1197
+ model = convert_to_output_type(model, ftype)
1198
+ outfile = args.outfile or default_outfile(model_plus.paths, ftype)
1199
+
1200
+ params.ftype = ftype
1201
+ print(f"Writing {outfile}, format {ftype}")
1202
+
1203
+ OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
1204
+ print(f"Wrote {outfile}")
 
1205
 
1206
 
1207
  if __name__ == '__main__':
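Putting the rewritten entry point together: since `main()` accepts `args_in`, a conversion can be driven programmatically with exactly the flags defined by the argparse block above. A hedged sketch, with placeholder model and output paths:

```python
# Equivalent to invoking the CLI; paths are placeholders.
main(["models/7B", "--outtype", "f16", "--outfile", "models/7B/ggml-model-f16.gguf"])

# A vocab-only export with a BPE tokenizer would instead look like:
# main(["models/my-bpe-model", "--vocab-only", "--vocabtype", "bpe", "--outfile", "vocab.gguf"])
```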
docs/token_generation_performance_tips.md CHANGED
@@ -3,7 +3,7 @@
3
  ## Verifying that the model is running on the GPU with cuBLAS
4
  Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
5
  ```shell
6
- ./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
7
  ```
8
 
9
  When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
25
  CPU: 7 physical cores
26
  RAM: 32GB
27
 
28
- Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
29
 
30
- Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
31
 
32
  Result:
33
 
 
3
  ## Verifying that the model is running on the GPU with cuBLAS
4
  Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
5
  ```shell
6
+ ./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
7
  ```
8
 
9
  When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
 
25
  CPU: 7 physical cores
26
  RAM: 32GB
27
 
28
+ Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
29
 
30
+ Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
31
 
32
  Result:
33
 
examples/CMakeLists.txt CHANGED
@@ -6,27 +6,6 @@ find_package(Threads REQUIRED)
6
 
7
  # ...
8
 
9
- # common
10
-
11
- set(TARGET common)
12
-
13
- add_library(${TARGET} OBJECT
14
- common.h
15
- common.cpp
16
- console.h
17
- console.cpp
18
- grammar-parser.h
19
- grammar-parser.cpp
20
- )
21
-
22
- if (BUILD_SHARED_LIBS)
23
- set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
24
- endif()
25
-
26
- target_include_directories(${TARGET} PUBLIC .)
27
- target_compile_features(${TARGET} PUBLIC cxx_std_11)
28
- target_link_libraries(${TARGET} PRIVATE llama)
29
-
30
  # examples
31
 
32
  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@@ -42,8 +21,12 @@ else()
42
  add_subdirectory(benchmark)
43
  add_subdirectory(baby-llama)
44
  add_subdirectory(train-text-from-scratch)
 
45
  add_subdirectory(simple)
 
46
  add_subdirectory(embd-input)
 
 
47
  if (LLAMA_METAL)
48
  add_subdirectory(metal)
49
  endif()
 
6
 
7
  # ...
8
9
  # examples
10
 
11
  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
21
  add_subdirectory(benchmark)
22
  add_subdirectory(baby-llama)
23
  add_subdirectory(train-text-from-scratch)
24
+ add_subdirectory(convert-llama2c-to-ggml)
25
  add_subdirectory(simple)
26
+ add_subdirectory(speculative)
27
  add_subdirectory(embd-input)
28
+ add_subdirectory(llama-bench)
29
+ add_subdirectory(beam-search)
30
  if (LLAMA_METAL)
31
  add_subdirectory(metal)
32
  endif()
examples/baby-llama/baby-llama.cpp CHANGED
@@ -9,12 +9,12 @@
9
  #endif
10
 
11
  #ifdef LLAMA_DEFAULT_RMS_EPS
12
- static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
13
  #else
14
- static const float rms_norm_eps = 5e-6f;
15
  #endif
16
 
17
- float frand() {
18
  return (float)rand()/(float)RAND_MAX;
19
  }
20
 
@@ -25,19 +25,21 @@ struct random_normal_distribution {
25
  float max;
26
  };
27
 
28
- void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
 
 
29
  rnd->gen = std::mt19937(seed);
30
  rnd->nd = std::normal_distribution<float>{mean, std};
31
  rnd->min = min;
32
  rnd->max = max;
33
  }
34
 
35
- float frand_normal(struct random_normal_distribution * rnd) {
36
  const float r = rnd->nd(rnd->gen);
37
  return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
38
  }
39
 
40
- void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
41
  struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
42
 
43
  if (plan.work_size > 0) {
@@ -48,13 +50,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
48
  ggml_graph_compute(graph, &plan);
49
  }
50
 
51
- struct ggml_tensor * randomize_tensor(
52
- struct ggml_tensor * tensor,
53
- int ndims,
54
- const int64_t ne[],
55
- float fmin,
56
- float fmax) {
57
-
58
  switch (ndims) {
59
  case 1:
60
  for (int i0 = 0; i0 < ne[0]; i0++) {
@@ -95,11 +93,9 @@ struct ggml_tensor * randomize_tensor(
95
  return tensor;
96
  }
97
 
98
- struct ggml_tensor * randomize_tensor_normal(
99
- struct ggml_tensor * tensor,
100
- int ndims,
101
- const int64_t ne[],
102
- struct random_normal_distribution * rnd) {
103
  float scale = 1.0; // xavier
104
  switch (ndims) {
105
  case 1:
@@ -159,7 +155,7 @@ struct llama_hparams {
159
  }
160
  };
161
 
162
- uint32_t get_n_ff(const struct llama_hparams* hparams) {
163
  const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
164
  return n_ff;
165
  }
@@ -260,7 +256,7 @@ struct llama_model_lora {
260
  std::vector<llama_layer_lora> layers;
261
  };
262
 
263
- void init_model(struct llama_model * model) {
264
  const auto & hparams = model->hparams;
265
 
266
  const uint32_t n_embd = hparams.n_embd;
@@ -297,7 +293,7 @@ void init_model(struct llama_model * model) {
297
  }
298
 
299
 
300
- void init_model_lora(struct llama_model_lora * model) {
301
  const auto & hparams = model->hparams;
302
 
303
  const uint32_t n_embd = hparams.n_embd;
@@ -340,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) {
340
  }
341
  }
342
 
343
- void set_param_model(struct llama_model * model) {
344
  const auto& hparams = model->hparams;
345
 
346
  const uint32_t n_layer = hparams.n_layer;
@@ -366,7 +362,7 @@ void set_param_model(struct llama_model * model) {
366
  }
367
  }
368
 
369
- void set_param_model_lora(struct llama_model_lora * model) {
370
  const auto& hparams = model->hparams;
371
 
372
  const uint32_t n_layer = hparams.n_layer;
@@ -397,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
397
  }
398
  }
399
 
400
- void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
401
  const auto & hparams = model->hparams;
402
 
403
  const uint32_t n_layer = hparams.n_layer;
@@ -426,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
426
  }
427
 
428
 
429
- void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
 
 
430
  const auto & hparams = model->hparams;
431
 
432
  const uint32_t n_layer = hparams.n_layer;
@@ -459,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
459
  }
460
  }
461
 
462
- bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
463
  const auto & hparams = model->hparams;
464
 
465
  const uint32_t n_ctx = hparams.n_ctx;
@@ -495,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
495
  return true;
496
  }
497
 
498
- bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
499
  const auto & hparams = model->hparams;
500
 
501
  const uint32_t n_ctx = hparams.n_ctx;
@@ -531,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
531
  return true;
532
  }
533
 
534
- struct ggml_tensor * forward(
535
- struct llama_model * model,
536
- struct llama_kv_cache * cache,
537
- struct ggml_context * ctx0,
538
- struct ggml_cgraph * gf,
539
- struct ggml_tensor * tokens_input,
540
- const int n_tokens,
541
- const int n_past) {
542
-
543
  const int N = n_tokens;
544
 
545
  struct llama_kv_cache& kv_self = *cache;
@@ -756,25 +754,25 @@ struct ggml_tensor * forward(
756
  return inpL;
757
  }
758
 
759
- void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
760
  GGML_ASSERT(tensor->n_dims == 1);
761
  GGML_ASSERT(tensor->ne[0] == ne0);
762
  }
763
 
764
- void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
765
  GGML_ASSERT(tensor->n_dims == 2);
766
  GGML_ASSERT(tensor->ne[0] == ne0);
767
  GGML_ASSERT(tensor->ne[1] == ne1);
768
  }
769
 
770
- void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
771
  GGML_ASSERT(tensor->n_dims == 3);
772
  GGML_ASSERT(tensor->ne[0] == ne0);
773
  GGML_ASSERT(tensor->ne[1] == ne1);
774
  GGML_ASSERT(tensor->ne[2] == ne2);
775
  }
776
 
777
- void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
778
  GGML_ASSERT(tensor->n_dims == 4);
779
  GGML_ASSERT(tensor->ne[0] == ne0);
780
  GGML_ASSERT(tensor->ne[1] == ne1);
@@ -782,16 +780,16 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
782
  GGML_ASSERT(tensor->ne[3] == ne3);
783
  }
784
 
785
- struct ggml_tensor * forward_batch(
786
- struct llama_model * model,
787
- struct llama_kv_cache * cache,
788
- struct ggml_context * ctx0,
789
- struct ggml_cgraph * gf,
790
- struct ggml_tensor * tokens_input,
791
- const int n_tokens,
792
- const int n_past,
793
- const int n_batch) {
794
-
795
  const int N = n_tokens;
796
 
797
  struct llama_kv_cache& kv_self = *cache;
@@ -1073,16 +1071,15 @@ struct ggml_tensor * forward_batch(
1073
  return inpL;
1074
  }
1075
 
1076
-
1077
- struct ggml_tensor * forward_lora(
1078
- struct llama_model_lora * model,
1079
- struct llama_kv_cache * cache,
1080
- struct ggml_context * ctx0,
1081
- struct ggml_cgraph * gf,
1082
- struct ggml_tensor * tokens_input,
1083
- const int n_tokens,
1084
- const int n_past) {
1085
-
1086
  const int N = n_tokens;
1087
 
1088
  struct llama_kv_cache& kv_self = *cache;
@@ -1328,7 +1325,7 @@ struct ggml_tensor * forward_lora(
1328
  return inpL;
1329
  }
1330
 
1331
- void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
1332
  assert(logits->n_dims == 2);
1333
  assert(probs->n_dims == 2);
1334
  assert(best_samples->n_dims == 1);
@@ -1359,7 +1356,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
1359
  }
1360
  }
1361
 
1362
- void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
 
 
 
1363
  GGML_ASSERT(best_samples->n_dims == 2);
1364
  GGML_ASSERT(logits->n_dims == 3);
1365
  GGML_ASSERT(probs->n_dims == 3);
@@ -1393,7 +1393,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
1393
  }
1394
  }
1395
 
1396
- void print_row(struct ggml_tensor * probs, int i) {
1397
  for (int k = 0; k < probs->ne[0]; ++k) {
1398
  float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
1399
  printf(" %.2f", p);
@@ -1401,7 +1401,7 @@ void print_row(struct ggml_tensor * probs, int i) {
1401
  printf("\n");
1402
  }
1403
 
1404
- void print_matrix(struct ggml_tensor * probs) {
1405
  assert(probs->n_dims == 2);
1406
  for (int i = 0; i < probs->ne[1]; ++i) {
1407
  for (int k = 0; k < probs->ne[0]; ++k) {
@@ -1412,7 +1412,7 @@ void print_matrix(struct ggml_tensor * probs) {
1412
  }
1413
  }
1414
 
1415
- void print_token(int token, int n_vocab) {
1416
  for (int k = 0; k < token; ++k) {
1417
  printf(" ");
1418
  }
@@ -1423,14 +1423,14 @@ void print_token(int token, int n_vocab) {
1423
  printf("\n");
1424
  }
1425
 
1426
- void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
1427
  for (int i=0; i<tokens->ne[0]; ++i) {
1428
  int token = ggml_get_i32_1d(tokens, i);
1429
  print_token(token, n_vocab);
1430
  }
1431
  }
1432
 
1433
- void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
1434
  int n_tokens = tokens_input->ne[0];
1435
  int n_vocab = targets->ne[0];
1436
  float randomness = 0.0f;
@@ -1451,7 +1451,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
1451
  }
1452
  }
1453
 
1454
- void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
 
 
1455
  GGML_ASSERT(tokens_input->n_dims == 2);
1456
  GGML_ASSERT( targets->n_dims == 3);
1457
  int n_tokens = tokens_input->ne[0];
@@ -1474,7 +1476,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
1474
  }
1475
  }
1476
 
1477
- void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
1478
  int n_tokens = tokens_input->ne[0];
1479
  int n_vocab = targets->ne[0];
1480
  for (int i=0; i<n_tokens-n_shift; ++i) {
@@ -1485,12 +1487,16 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
1485
  }
1486
  }
1487
 
1488
- struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
 
 
1489
  // todo: instead of a-b: a[1:]-b[:-1]
1490
  return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
1491
  }
1492
 
1493
- struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
 
 
1494
  const float eps = 1e-3f;
1495
  return
1496
  ggml_sum(ctx,
@@ -1617,15 +1623,10 @@ int main(int argc, char ** argv) {
1617
 
1618
  float error_before_opt = ggml_get_f32_1d(e, 0);
1619
 
1620
- struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
1621
  struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
1622
- opt_params_adam.print_forward_graph = false;
1623
- opt_params_adam.print_backward_graph = false;
1624
  opt_params_lbfgs.print_forward_graph = false;
1625
  opt_params_lbfgs.print_backward_graph = false;
1626
- opt_params_adam.adam.n_iter = 16;
1627
  opt_params_lbfgs.lbfgs.n_iter = 16;
1628
- // ggml_opt(ctx0, opt_params_adam, e);
1629
  ggml_opt(ctx0, opt_params_lbfgs, e);
1630
  //
1631
  ggml_build_forward_expand(&gf, e);
 
9
  #endif
10
 
11
  #ifdef LLAMA_DEFAULT_RMS_EPS
12
+ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
13
  #else
14
+ constexpr float rms_norm_eps = 5e-6f;
15
  #endif
16
 
17
+ static float frand() {
18
  return (float)rand()/(float)RAND_MAX;
19
  }
20
 
 
25
  float max;
26
  };
27
 
28
+ static void init_random_normal_distribution(
29
+ struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
30
+ ) {
31
  rnd->gen = std::mt19937(seed);
32
  rnd->nd = std::normal_distribution<float>{mean, std};
33
  rnd->min = min;
34
  rnd->max = max;
35
  }
36
 
37
+ static float frand_normal(struct random_normal_distribution * rnd) {
38
  const float r = rnd->nd(rnd->gen);
39
  return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
40
  }
41
 
42
+ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
43
  struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
44
 
45
  if (plan.work_size > 0) {
 
50
  ggml_graph_compute(graph, &plan);
51
  }
52
 
53
+ static struct ggml_tensor * randomize_tensor(
54
+ struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
55
+ ) {
 
 
 
 
56
  switch (ndims) {
57
  case 1:
58
  for (int i0 = 0; i0 < ne[0]; i0++) {
 
93
  return tensor;
94
  }
95
 
96
+ static struct ggml_tensor * randomize_tensor_normal(
97
+ struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
98
+ ) {
 
 
99
  float scale = 1.0; // xavier
100
  switch (ndims) {
101
  case 1:
 
155
  }
156
  };
157
 
158
+ static uint32_t get_n_ff(const struct llama_hparams* hparams) {
159
  const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
160
  return n_ff;
161
  }
 
256
  std::vector<llama_layer_lora> layers;
257
  };
258
 
259
+ static void init_model(struct llama_model * model) {
260
  const auto & hparams = model->hparams;
261
 
262
  const uint32_t n_embd = hparams.n_embd;
 
293
  }
294
 
295
 
296
+ static void init_model_lora(struct llama_model_lora * model) {
297
  const auto & hparams = model->hparams;
298
 
299
  const uint32_t n_embd = hparams.n_embd;
 
336
  }
337
  }
338
 
339
+ static void set_param_model(struct llama_model * model) {
340
  const auto& hparams = model->hparams;
341
 
342
  const uint32_t n_layer = hparams.n_layer;
 
362
  }
363
  }
364
 
365
+ static void set_param_model_lora(struct llama_model_lora * model) {
366
  const auto& hparams = model->hparams;
367
 
368
  const uint32_t n_layer = hparams.n_layer;
 
393
  }
394
  }
395
 
396
+ static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
397
  const auto & hparams = model->hparams;
398
 
399
  const uint32_t n_layer = hparams.n_layer;
 
422
  }
423
 
424
 
425
+ static void randomize_model_lora(
426
+ struct llama_model_lora * model, int seed, float mean, float std, float min, float max
427
+ ) {
428
  const auto & hparams = model->hparams;
429
 
430
  const uint32_t n_layer = hparams.n_layer;
 
457
  }
458
  }
459
 
460
+ static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
461
  const auto & hparams = model->hparams;
462
 
463
  const uint32_t n_ctx = hparams.n_ctx;
 
493
  return true;
494
  }
495
 
496
+ static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
497
  const auto & hparams = model->hparams;
498
 
499
  const uint32_t n_ctx = hparams.n_ctx;
 
529
  return true;
530
  }
531
 
532
+ static struct ggml_tensor * forward(
533
+ struct llama_model * model,
534
+ struct llama_kv_cache * cache,
535
+ struct ggml_context * ctx0,
536
+ struct ggml_cgraph * gf,
537
+ struct ggml_tensor * tokens_input,
538
+ const int n_tokens,
539
+ const int n_past
540
+ ) {
541
  const int N = n_tokens;
542
 
543
  struct llama_kv_cache& kv_self = *cache;
 
754
  return inpL;
755
  }
756
 
757
+ static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
758
  GGML_ASSERT(tensor->n_dims == 1);
759
  GGML_ASSERT(tensor->ne[0] == ne0);
760
  }
761
 
762
+ static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
763
  GGML_ASSERT(tensor->n_dims == 2);
764
  GGML_ASSERT(tensor->ne[0] == ne0);
765
  GGML_ASSERT(tensor->ne[1] == ne1);
766
  }
767
 
768
+ static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
769
  GGML_ASSERT(tensor->n_dims == 3);
770
  GGML_ASSERT(tensor->ne[0] == ne0);
771
  GGML_ASSERT(tensor->ne[1] == ne1);
772
  GGML_ASSERT(tensor->ne[2] == ne2);
773
  }
774
 
775
+ static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
776
  GGML_ASSERT(tensor->n_dims == 4);
777
  GGML_ASSERT(tensor->ne[0] == ne0);
778
  GGML_ASSERT(tensor->ne[1] == ne1);
 
780
  GGML_ASSERT(tensor->ne[3] == ne3);
781
  }
782
 
783
+ static struct ggml_tensor * forward_batch(
784
+ struct llama_model * model,
785
+ struct llama_kv_cache * cache,
786
+ struct ggml_context * ctx0,
787
+ struct ggml_cgraph * gf,
788
+ struct ggml_tensor * tokens_input,
789
+ const int n_tokens,
790
+ const int n_past,
791
+ const int n_batch
792
+ ) {
793
  const int N = n_tokens;
794
 
795
  struct llama_kv_cache& kv_self = *cache;
 
1071
  return inpL;
1072
  }
1073
 
1074
+ static struct ggml_tensor * forward_lora(
1075
+ struct llama_model_lora * model,
1076
+ struct llama_kv_cache * cache,
1077
+ struct ggml_context * ctx0,
1078
+ struct ggml_cgraph * gf,
1079
+ struct ggml_tensor * tokens_input,
1080
+ const int n_tokens,
1081
+ const int n_past
1082
+ ) {
 
1083
  const int N = n_tokens;
1084
 
1085
  struct llama_kv_cache& kv_self = *cache;
 
1325
  return inpL;
1326
  }
1327
 
1328
+ static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
1329
  assert(logits->n_dims == 2);
1330
  assert(probs->n_dims == 2);
1331
  assert(best_samples->n_dims == 1);
 
1356
  }
1357
  }
1358
 
1359
+ static void sample_softmax_batch(
1360
+ struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
1361
+ struct ggml_tensor * best_samples
1362
+ ) {
1363
  GGML_ASSERT(best_samples->n_dims == 2);
1364
  GGML_ASSERT(logits->n_dims == 3);
1365
  GGML_ASSERT(probs->n_dims == 3);
 
1393
  }
1394
  }
1395
 
1396
+ static void print_row(struct ggml_tensor * probs, int i) {
1397
  for (int k = 0; k < probs->ne[0]; ++k) {
1398
  float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
1399
  printf(" %.2f", p);
 
1401
  printf("\n");
1402
  }
1403
 
1404
+ static void print_matrix(struct ggml_tensor * probs) {
1405
  assert(probs->n_dims == 2);
1406
  for (int i = 0; i < probs->ne[1]; ++i) {
1407
  for (int k = 0; k < probs->ne[0]; ++k) {
 
1412
  }
1413
  }
1414
 
1415
+ static void print_token(int token, int n_vocab) {
1416
  for (int k = 0; k < token; ++k) {
1417
  printf(" ");
1418
  }
 
1423
  printf("\n");
1424
  }
1425
 
1426
+ static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
1427
  for (int i=0; i<tokens->ne[0]; ++i) {
1428
  int token = ggml_get_i32_1d(tokens, i);
1429
  print_token(token, n_vocab);
1430
  }
1431
  }
1432
 
1433
+ static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
1434
  int n_tokens = tokens_input->ne[0];
1435
  int n_vocab = targets->ne[0];
1436
  float randomness = 0.0f;
 
1451
  }
1452
  }
1453
 
1454
+ static void get_example_targets_batch(
1455
+ struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
1456
+ ) {
1457
  GGML_ASSERT(tokens_input->n_dims == 2);
1458
  GGML_ASSERT( targets->n_dims == 3);
1459
  int n_tokens = tokens_input->ne[0];
 
1476
  }
1477
  }
1478
 
1479
+ static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
1480
  int n_tokens = tokens_input->ne[0];
1481
  int n_vocab = targets->ne[0];
1482
  for (int i=0; i<n_tokens-n_shift; ++i) {
 
1487
  }
1488
  }
1489
 
1490
+ static struct ggml_tensor * square_error_loss(
1491
+ struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
1492
+ ) {
1493
  // todo: instead of a-b: a[1:]-b[:-1]
1494
  return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
1495
  }
1496
 
1497
+ static struct ggml_tensor * cross_entropy_loss(
1498
+ struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
1499
+ ) {
1500
  const float eps = 1e-3f;
1501
  return
1502
  ggml_sum(ctx,
 
1623
 
1624
  float error_before_opt = ggml_get_f32_1d(e, 0);
1625
 
 
1626
  struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
 
 
1627
  opt_params_lbfgs.print_forward_graph = false;
1628
  opt_params_lbfgs.print_backward_graph = false;
 
1629
  opt_params_lbfgs.lbfgs.n_iter = 16;
 
1630
  ggml_opt(ctx0, opt_params_lbfgs, e);
1631
  //
1632
  ggml_build_forward_expand(&gf, e);
examples/beam-search/CMakeLists.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ set(TARGET beam-search)
2
+ add_executable(${TARGET} beam-search.cpp)
3
+ install(TARGETS ${TARGET} RUNTIME)
4
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
5
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/beam-search/beam-search.cpp ADDED
@@ -0,0 +1,186 @@
1
+ #include "common.h"
2
+ #include "llama.h"
3
+
4
+ #include <cassert>
5
+ #include <cinttypes>
6
+ #include <cmath>
7
+ #include <cstdio>
8
+ #include <cstring>
9
+ #include <ctime>
10
+ #include <fstream>
11
+ #include <iostream>
12
+ #include <string>
13
+ #include <vector>
14
+
15
+ #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
16
+ #include <signal.h>
17
+ #include <unistd.h>
18
+ #elif defined (_WIN32)
19
+ #define WIN32_LEAN_AND_MEAN
20
+ #ifndef NOMINMAX
21
+ # define NOMINMAX
22
+ #endif
23
+ #include <windows.h>
24
+ #include <signal.h>
25
+ #endif
26
+
27
+ // Used for debugging to print out beam tokens.
28
+ struct ostream_beam_view {
29
+ llama_context * ctx;
30
+ llama_beam_view beam_view;
31
+ };
32
+
33
+ static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
34
+ os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
35
+ for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
36
+ os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
37
+ }
38
+ return os << ')';
39
+ }
40
+
41
+ // Put here anything you want back in beam_search_callback().
42
+ struct beam_search_callback_data {
43
+ llama_context * ctx;
44
+ std::vector<llama_token> response;
45
+ };
46
+
47
+ // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos), but the two need not always coincide.
48
+ // For example, eob can be flagged due to maximum token length, stop words, etc.
49
+ static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
50
+ return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
51
+ }
52
+
53
+ // Function matching type llama_beam_search_callback_fn_t.
54
+ // Custom callback example is called each time the beams lengths increase:
55
+ // * Show progress by printing ',' following by number of convergent beam tokens if any.
56
+ // * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
57
+ // This is also called when the stop condition is met.
58
+ // Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
59
+ static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
60
+ auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
61
+ // Mark beams as EOS as needed.
62
+ for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
63
+ llama_beam_view& beam_view = beams_state.beam_views[i];
64
+ if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
65
+ beam_view.eob = true;
66
+ }
67
+ }
68
+ printf(","); // Show progress
69
+ if (const size_t n = beams_state.common_prefix_length) {
70
+ callback_data.response.resize(callback_data.response.size() + n);
71
+ assert(0u < beams_state.n_beams);
72
+ const llama_token * tokens = beams_state.beam_views[0].tokens;
73
+ std::copy(tokens, tokens + n, callback_data.response.end() - n);
74
+ printf("%zu", n);
75
+ }
76
+ fflush(stdout);
77
+ #if 1 // DEBUG: print current beams for this iteration
78
+ std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
79
+ for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
80
+ std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
81
+ }
82
+ #endif
83
+ }
84
+
85
+ int main(int argc, char ** argv)
86
+ {
87
+ gpt_params params;
88
+ //params.n_gpu_layers = 200;
89
+
90
+ //---------------------------------
91
+ // Print help :
92
+ //---------------------------------
93
+
94
+ if ( argc < 2 || argv[1][0] == '-' )
95
+ {
96
+ printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
97
+ return 1 ;
98
+ }
99
+
100
+ //---------------------------------
101
+ // Load parameters :
102
+ //---------------------------------
103
+
104
+ params.model = argv[1];
105
+
106
+ params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
107
+
108
+ if ( argc > 3 )
109
+ {
110
+ params.prompt = argv[3];
111
+ }
112
+
113
+ if ( params.prompt.empty() )
114
+ {
115
+ params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
116
+ }
117
+
118
+ //---------------------------------
119
+ // Init LLM :
120
+ //---------------------------------
121
+
122
+ llama_backend_init(params.numa);
123
+
124
+ llama_model * model;
125
+ llama_context * ctx;
126
+
127
+ std::tie(model, ctx) = llama_init_from_gpt_params( params );
128
+
129
+ if ( model == NULL )
130
+ {
131
+ fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
132
+ return 1;
133
+ }
134
+
135
+ //---------------------------------
136
+ // Tokenize the prompt :
137
+ //---------------------------------
138
+
139
+ std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
140
+
141
+ const size_t max_context_size = llama_n_ctx( ctx );
142
+ const size_t max_tokens_list_size = max_context_size - 4 ;
143
+
144
+ if (tokens_list.size() > max_tokens_list_size)
145
+ {
146
+ fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
147
+ __func__ , tokens_list.size() , max_tokens_list_size );
148
+ return 1;
149
+ }
150
+
151
+ fprintf( stderr, "\n\n" );
152
+
153
+ // Print the tokens from the prompt :
154
+
155
+ for( auto id : tokens_list )
156
+ {
157
+ std::cout << llama_token_to_piece(ctx, id);
158
+ }
159
+ std::cout << std::flush;
160
+
161
+ int n_past = llama_get_kv_cache_token_count(ctx);
162
+ if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
163
+ {
164
+ fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
165
+ return 1;
166
+ }
167
+ n_past += tokens_list.size();
168
+
169
+ beam_search_callback_data callback_data{ctx, {}};
170
+ size_t const beam_width = static_cast<size_t>(params.n_beams);
171
+ int const n_predict = 256;
172
+ llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
173
+
174
+ std::cout << "\n\n";
175
+ for (llama_token const token_id : callback_data.response) {
176
+ std::cout << llama_token_to_piece(ctx,token_id);
177
+ }
178
+ std::cout << std::endl;
179
+
180
+ llama_free( ctx );
181
+ llama_free_model( model );
182
+
183
+ llama_backend_free();
184
+
185
+ return 0;
186
+ }
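For reference, the example above is driven purely by positional arguments (see its usage string: `MODEL_PATH [BEAM_WIDTH=2] [PROMPT]`). A minimal invocation sketch, assuming a quantized model at the same path the updated `chat.sh` below uses (that path is an assumption, not part of this example):
```
# beam width defaults to 2; with no prompt argument the built-in "How many countries are there?" request is used
./beam-search ./models/llama-7b/ggml-model-q4_0.gguf
# explicit beam width and prompt
./beam-search ./models/llama-7b/ggml-model-q4_0.gguf 4 "Describe beam search in one sentence."
```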
examples/benchmark/CMakeLists.txt CHANGED
@@ -1,7 +1,8 @@
1
  set(TARGET benchmark)
2
  add_executable(${TARGET} benchmark-matmult.cpp)
3
  install(TARGETS ${TARGET} RUNTIME)
4
- target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 
5
  target_compile_features(${TARGET} PRIVATE cxx_std_11)
6
  if(TARGET BUILD_INFO)
7
  add_dependencies(${TARGET} BUILD_INFO)
 
1
  set(TARGET benchmark)
2
  add_executable(${TARGET} benchmark-matmult.cpp)
3
  install(TARGETS ${TARGET} RUNTIME)
4
+ target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
5
+ target_include_directories(${TARGET} PRIVATE ../../common)
6
  target_compile_features(${TARGET} PRIVATE cxx_std_11)
7
  if(TARGET BUILD_INFO)
8
  add_dependencies(${TARGET} BUILD_INFO)
examples/benchmark/benchmark-matmult.cpp CHANGED
@@ -1,5 +1,6 @@
1
- #include "ggml.h"
2
  #include "build-info.h"
 
 
3
 
4
  #include <locale.h>
5
  #include <assert.h>
@@ -20,7 +21,7 @@
20
  #pragma warning(disable: 4244 4267) // possible loss of data
21
  #endif
22
 
23
- void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
24
  struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
25
 
26
  if (plan.work_size > 0) {
@@ -31,19 +32,19 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
31
  ggml_graph_compute(graph, &plan);
32
  }
33
 
34
- float tensor_sum_elements(const ggml_tensor * tensor) {
35
- float sum = 0;
36
- if (tensor->type==GGML_TYPE_F32) {
37
  for (int j = 0; j < tensor->ne[1]; j++) {
38
  for (int k = 0; k < tensor->ne[0]; k++) {
39
- sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
40
  }
41
  }
42
  }
43
  return sum;
44
  }
45
 
46
- void tensor_dump(const ggml_tensor * tensor, const char * name) {
47
  printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
48
  tensor->type, ggml_type_name(tensor->type),
49
  tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
@@ -58,7 +59,7 @@ struct benchmark_params_struct {
58
  int32_t n_iterations = 10;
59
  };
60
 
61
- void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
62
  fprintf(stderr, "usage: %s [options]\n", argv[0]);
63
  fprintf(stderr, "\n");
64
  fprintf(stderr, "options:\n");
@@ -99,7 +100,7 @@ int main(int argc, char ** argv) {
99
  exit(1);
100
  }
101
 
102
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
103
  printf("Starting Test\n");
104
 
105
  // create the ggml context
@@ -125,12 +126,15 @@ int main(int argc, char ** argv) {
125
 
126
  //printf("Memsize required = %i\n", sizex*sizex);
127
 
 
 
 
128
  size_t ctx_size = 0;
129
  ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
130
  ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
131
  ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
132
- ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
133
- ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
134
  ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
135
  ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
136
  ctx_size += 1024*1024*16;
@@ -163,7 +167,7 @@ int main(int argc, char ** argv) {
163
  struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
164
  ggml_set_f32(m2, 2.0f);
165
 
166
- printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
167
  // printf("Creating new tensor m11xm2\n");
168
  struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
169
 
@@ -181,17 +185,16 @@ int main(int argc, char ** argv) {
181
 
182
  TENSOR_DUMP(gf.nodes[0]);
183
 
184
- printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
185
 
186
  int32_t nelements = sizex*sizey;
187
- int32_t ne[2] = { sizex, sizey };
188
 
189
  std::vector<int64_t> hist_cur(1 << 4, 0);
190
 
191
  // Set up the benchmark matrices
192
  // printf("Creating new tensor q11 & Running quantize\n");
193
- struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
194
- ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
195
 
196
  // Set up the compute graph
197
  // printf("Creating new tensor q31\n");
@@ -202,8 +205,8 @@ int main(int argc, char ** argv) {
202
 
203
  // Set up a second graph computation to make sure we override the CPU cache lines
204
  // printf("Creating new tensor q12 & Running quantize\n");
205
- struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
206
- ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
207
 
208
  // printf("Creating new tensor q32\n");
209
  struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@@ -220,7 +223,7 @@ int main(int argc, char ** argv) {
220
  printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
221
 
222
 
223
- // Let's use the F32 result from above as a reference for the q4_0 multiplication
224
  float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
225
 
226
  printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
@@ -250,7 +253,7 @@ int main(int argc, char ** argv) {
250
  // Check that the matrix multiplication result is in the right ballpark
251
  // We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
252
  float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
253
- float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
254
  float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
255
 
256
  if (delta > allowed_delta) {
 
 
1
  #include "build-info.h"
2
+ #include "common.h"
3
+ #include "ggml.h"
4
 
5
  #include <locale.h>
6
  #include <assert.h>
 
21
  #pragma warning(disable: 4244 4267) // possible loss of data
22
  #endif
23
 
24
+ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
25
  struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
26
 
27
  if (plan.work_size > 0) {
 
32
  ggml_graph_compute(graph, &plan);
33
  }
34
 
35
+ static float tensor_sum_elements(const ggml_tensor * tensor) {
36
+ double sum = 0;
37
+ if (tensor->type == GGML_TYPE_F32) {
38
  for (int j = 0; j < tensor->ne[1]; j++) {
39
  for (int k = 0; k < tensor->ne[0]; k++) {
40
+ sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
41
  }
42
  }
43
  }
44
  return sum;
45
  }
46
 
47
+ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
48
  printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
49
  tensor->type, ggml_type_name(tensor->type),
50
  tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
 
59
  int32_t n_iterations = 10;
60
  };
61
 
62
+ static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
63
  fprintf(stderr, "usage: %s [options]\n", argv[0]);
64
  fprintf(stderr, "\n");
65
  fprintf(stderr, "options:\n");
 
100
  exit(1);
101
  }
102
 
103
+ print_build_info();
104
  printf("Starting Test\n");
105
 
106
  // create the ggml context
 
126
 
127
  //printf("Memsize required = %i\n", sizex*sizex);
128
 
129
+ // TODO: perform the bench for all types or for a user specified type
130
+ const ggml_type qtype = GGML_TYPE_Q4_1;
131
+
132
  size_t ctx_size = 0;
133
  ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
134
  ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
135
  ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
136
+ ctx_size += sizex*sizey*ggml_type_sizef(qtype);
137
+ ctx_size += sizex*sizey*ggml_type_sizef(qtype);
138
  ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
139
  ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
140
  ctx_size += 1024*1024*16;
 
167
  struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
168
  ggml_set_f32(m2, 2.0f);
169
 
170
+ printf("\n------ Test 1 - Matrix Mult via F32 code\n");
171
  // printf("Creating new tensor m11xm2\n");
172
  struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
173
 
 
185
 
186
  TENSOR_DUMP(gf.nodes[0]);
187
 
188
+ printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
189
 
190
  int32_t nelements = sizex*sizey;
 
191
 
192
  std::vector<int64_t> hist_cur(1 << 4, 0);
193
 
194
  // Set up the benchmark matrices
195
  // printf("Creating new tensor q11 & Running quantize\n");
196
+ struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
197
+ ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
198
 
199
  // Set up the compute graph
200
  // printf("Creating new tensor q31\n");
 
205
 
206
  // Set up a second graph computation to make sure we override the CPU cache lines
207
  // printf("Creating new tensor q12 & Running quantize\n");
208
+ struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
209
+ ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
210
 
211
  // printf("Creating new tensor q32\n");
212
  struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
 
223
  printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
224
 
225
 
226
+ // Let's use the F32 result from above as a reference for the quantized multiplication
227
  float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
228
 
229
  printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
 
253
  // Check that the matrix multiplication result is in the right ballpark
254
  // We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
255
  float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
256
+ float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
257
  float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
258
 
259
  if (delta > allowed_delta) {
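A quick way to exercise the change above (quantization now goes through `ggml_quantize_chunk` with a selectable `qtype`, Q4_1 by default) is to build and run the `benchmark` target defined in the CMakeLists earlier in this diff. The commands below are a sketch and assume the repository's usual CMake layout with binaries under `build/bin`:
```
cmake -B build
cmake --build build --target benchmark
./build/bin/benchmark   # runs the F32 and quantized matrix-multiplication tests with the default sizes
```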
examples/chat.sh CHANGED
@@ -11,6 +11,6 @@ cd ..
11
  #
12
  # "--keep 48" is based on the contents of prompts/chat-with-bob.txt
13
  #
14
- ./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \
15
  --repeat_penalty 1.0 --color -i \
16
  -r "User:" -f prompts/chat-with-bob.txt
 
11
  #
12
  # "--keep 48" is based on the contents of prompts/chat-with-bob.txt
13
  #
14
+ ./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
15
  --repeat_penalty 1.0 --color -i \
16
  -r "User:" -f prompts/chat-with-bob.txt
examples/convert-llama2c-to-ggml/CMakeLists.txt ADDED
@@ -0,0 +1,5 @@
1
+ set(TARGET convert-llama2c-to-ggml)
2
+ add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
3
+ install(TARGETS ${TARGET} RUNTIME)
4
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
5
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/convert-llama2c-to-ggml/README.md ADDED
@@ -0,0 +1,26 @@
1
+ ## Convert llama2.c model to ggml
2
+
3
+ This example reads weights from the [llama2.c](https://github.com/karpathy/llama2.c) project and saves them in a ggml-compatible format. The vocab available in `models/ggml-vocab.bin` is used by default.
4
+
5
+ To convert the model, first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository and build this example:
6
+
7
+ `$ make -j`
8
+
9
+ After successful compilation, the following usage options are available:
10
+ ```
11
+ usage: ./convert-llama2c-to-ggml [options]
12
+
13
+ options:
14
+ -h, --help show this help message and exit
15
+ --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf')
16
+ --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
17
+ --llama2c-output-model FNAME model path to save the converted llama2.c model (default 'ak_llama_model.bin')
18
+ ```
19
+
20
+ An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
21
+
22
+ `$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
23
+
24
+ Now you can use the model with a command like:
25
+
26
+ `$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
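The converter also accepts a raw llama2.c vocabulary file for `--copy-vocab-from-model`: as `load_vocab` in the source below shows, a file that is not GGUF is assumed to be in the llama2.c tokenizer format. A hedged example (the `tokenizer.bin` path is an assumption based on the llama2.c repository layout):

`$ ./convert-llama2c-to-ggml --copy-vocab-from-model /path/to/llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`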
examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ADDED
@@ -0,0 +1,963 @@
1
+ #include "ggml.h"
2
+ #include "llama.h"
3
+ #include "common.h"
4
+
5
+ #include <unordered_map>
6
+ #include <vector>
7
+ #include <cassert>
8
+ #include <climits>
9
+ #include <cstring>
10
+ #include <cstdarg>
11
+ #include <ctime>
12
+ #include <random>
13
+ #include <stdexcept>
14
+ #include <sstream>
15
+ #include <algorithm>
16
+ #include <string>
17
+
18
+ // GGUF keys & tensor names.
19
+
20
+ #define KV_GENERAL_ARCHITECTURE "general.architecture"
21
+ #define KV_GENERAL_NAME "general.name"
22
+
23
+ #define KV_TOKENIZER_MODEL "tokenizer.ggml.model"
24
+ #define KV_TOKENIZER_LIST "tokenizer.ggml.tokens"
25
+ #define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type"
26
+ #define KV_TOKENIZER_SCORES "tokenizer.ggml.scores"
27
+ #define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id"
28
+ #define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id"
29
+ #define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id"
30
+ #define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id"
31
+ #define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id"
32
+ #define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json"
33
+
34
+ #define KV_CONTEXT_LENGTH "llama.context_length"
35
+ #define KV_EMBEDDING_LENGTH "llama.embedding_length"
36
+ #define KV_BLOCK_COUNT "llama.block_count"
37
+ #define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length"
38
+ #define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count"
39
+ #define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv"
40
+ #define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon"
41
+ #define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count"
42
+
43
+ #define TN_TOKEN_EMBD "token_embd.weight"
44
+ #define TN_OUTPUT_NORM "output_norm.weight"
45
+ #define TN_OUTPUT "output.weight"
46
+ #define TN_ATTN_NORM "blk.%d.attn_norm.weight"
47
+ #define TN_ATTN_Q "blk.%d.attn_q.weight"
48
+ #define TN_ATTN_K "blk.%d.attn_k.weight"
49
+ #define TN_ATTN_V "blk.%d.attn_v.weight"
50
+ #define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
51
+ #define TN_FFN_NORM "blk.%d.ffn_norm.weight"
52
+ #define TN_FFN_GATE "blk.%d.ffn_gate.weight"
53
+ #define TN_FFN_DOWN "blk.%d.ffn_down.weight"
54
+ #define TN_FFN_UP "blk.%d.ffn_up.weight"
55
+
56
+ #if defined(_MSC_VER)
57
+ #pragma warning(disable: 4244 4267) // possible loss of data
58
+ #endif
59
+
60
+ #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
61
+ #define LLAMA_FILE_VERSION_GGJT_V3 3
62
+
63
+ #define TOKENIZER_NAME "llama"
64
+ #define UNKNOWN_TOKEN_ID 0
65
+ #define BOS_TOKEN_ID 1
66
+ #define EOS_TOKEN_ID 2
67
+
68
+ //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
69
+ typedef struct {
70
+ int dim; // transformer dimension
71
+ int hidden_dim; // for ffn layers
72
+ int n_layers; // number of layers
73
+ int n_heads; // number of query heads
74
+ int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
75
+ int vocab_size; // vocabulary size, usually 256 (byte-level)
76
+ int seq_len; // max sequence length
77
+ } Config;
78
+
79
+ struct TransformerWeights {
80
+ // token embedding table
81
+ float* token_embedding_table; // (vocab_size, dim)
82
+ // weights for rmsnorms
83
+ float* rms_att_weight; // (layer, dim) rmsnorm weights
84
+ float* rms_ffn_weight; // (layer, dim)
85
+ // weights for matmuls
86
+ float* wq; // (layer, dim, dim)
87
+ float* wk; // (layer, dim, dim)
88
+ float* wv; // (layer, dim, dim)
89
+ float* wo; // (layer, dim, dim)
90
+ // weights for ffn
91
+ float* w1; // (layer, hidden_dim, dim)
92
+ float* w2; // (layer, dim, hidden_dim)
93
+ float* w3; // (layer, hidden_dim, dim)
94
+ // final rmsnorm
95
+ float* rms_final_weight; // (dim,)
96
+ // freq_cis for RoPE relatively positional embeddings
97
+ // float* freq_cis_real; // (seq_len, dim/2)
98
+ // float* freq_cis_imag; // (seq_len, dim/2)
99
+ // (optional) classifier weights for the logits, on the last layer
100
+ float* wcls;
101
+
102
+ ~TransformerWeights() {
103
+ delete[] token_embedding_table;
104
+ delete[] rms_att_weight;
105
+ delete[] rms_ffn_weight;
106
+ delete[] wq;
107
+ delete[] wk;
108
+ delete[] wv;
109
+ delete[] wo;
110
+ delete[] w1;
111
+ delete[] w2;
112
+ delete[] w3;
113
+ delete[] rms_final_weight;
114
+ delete[] wcls;
115
+ }
116
+ };
117
+
118
+ static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
119
+ // allocate zero-initialized arrays (value-initializing new[]) to keep valgrind happy
120
+ w->token_embedding_table = new float[p->vocab_size * p->dim]();
121
+ printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
122
+
123
+ w->rms_att_weight = new float[p->n_layers * p->dim]();
124
+ printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
125
+
126
+ w->rms_ffn_weight = new float[p->n_layers * p->dim]();
127
+ printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
128
+
129
+ w->wq = new float[p->n_layers * p->dim * p->dim]();
130
+ printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
131
+
132
+ w->wk = new float[p->n_layers * p->dim * p->dim]();
133
+ printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
134
+
135
+ w->wv = new float[p->n_layers * p->dim * p->dim]();
136
+ printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
137
+
138
+ w->wo = new float[p->n_layers * p->dim * p->dim]();
139
+ printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
140
+
141
+ w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
142
+ printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
143
+
144
+ w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
145
+ printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
146
+
147
+ w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
148
+ printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
149
+
150
+ w->rms_final_weight = new float[p->dim]();
151
+ printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
152
+
153
+ if (shared_weights) {
154
+ w->wcls = NULL;
155
+ } else {
156
+ w->wcls = new float[p->vocab_size * p->dim]();
157
+ printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
158
+ }
159
+ }
160
+
161
+ static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
162
+ if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
163
+ if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
164
+ if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
165
+ if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
166
+ if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
167
+ if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
168
+ if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
169
+ if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
170
+ if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
171
+ if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
172
+ if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
173
+
174
+ // Skip freq_cis_real & freq_cis_imag
175
+ int head_size = p->dim / p->n_heads;
176
+ fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
177
+
178
+ if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
179
+
180
+ // Check we didn't forget to read anything
181
+ auto curr = ftell(f);
182
+ fseek(f, 0, SEEK_END);
183
+ auto end = ftell(f);
184
+ if (curr != end) {
185
+ printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
186
+ return 1;
187
+ }
188
+
189
+ return 0;
190
+ }
191
+
192
+ static void print_sample_weights(TransformerWeights *w){
193
+ printf("----- Quick print of first of the weight vales of all the variables\n");
194
+ printf("%f\n", w->token_embedding_table[0]);
195
+ printf("%f\n", w->rms_att_weight[0]);
196
+ printf("%f\n", w->rms_ffn_weight[0]);
197
+
198
+ printf("%f\n", w->wq[0]);
199
+ printf("%f\n", w->wk[0]);
200
+ printf("%f\n", w->wv[0]);
201
+ printf("%f\n", w->wo[0]);
202
+ printf("%f\n", w->w1[0]);
203
+ printf("%f\n", w->w2[0]);
204
+ printf("%f\n", w->w3[0]);
205
+ printf("%f\n", w->rms_att_weight[0]);
206
+ if (w->wcls) printf("%f\n", w->wcls[0]);
207
+ }
208
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////
209
+
210
+ //////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
211
+
212
+ struct llama_vocab {
213
+ using id = int32_t;
214
+ using token = std::string;
215
+ using ttype = llama_token_type;
216
+
217
+ struct token_data {
218
+ token text;
219
+ float score;
220
+ ttype type;
221
+ };
222
+
223
+ std::unordered_map<token, id> token_to_id;
224
+ std::vector<token_data> id_to_token;
225
+ };
226
+
227
+ struct my_llama_hparams {
228
+ uint32_t n_vocab = 32000;
229
+ uint32_t n_ctx = 512; // this is provided as user input?
230
+ uint32_t n_embd = 4096;
231
+ uint32_t n_ff = 11008;
232
+ uint32_t n_mult = 4;
233
+ uint32_t n_head = 32;
234
+ uint32_t n_layer = 32;
235
+ uint32_t n_rot = 64;
236
+ bool operator!=(const my_llama_hparams& other) const {
237
+ return memcmp(this, &other, sizeof(my_llama_hparams));
238
+ }
239
+ };
240
+
241
+ struct my_llama_layer {
242
+ // normalization
243
+ struct ggml_tensor * attention_norm;
244
+
245
+ // attention
246
+ struct ggml_tensor * wq;
247
+ struct ggml_tensor * wk;
248
+ struct ggml_tensor * wv;
249
+ struct ggml_tensor * wo;
250
+
251
+ // normalization
252
+ struct ggml_tensor * ffn_norm;
253
+
254
+ // ff
255
+ struct ggml_tensor * w1;
256
+ struct ggml_tensor * w2;
257
+ struct ggml_tensor * w3;
258
+ };
259
+
260
+ struct my_llama_model {
261
+ struct ggml_context * ctx = NULL;
262
+
263
+ std::string name;
264
+
265
+ my_llama_hparams hparams;
266
+
267
+ struct ggml_tensor * tok_embeddings;
268
+
269
+ struct ggml_tensor * norm;
270
+ struct ggml_tensor * output;
271
+
272
+ std::vector<my_llama_layer> layers;
273
+
274
+ uint32_t train_its = 0;
275
+ uint32_t train_samples = 0;
276
+ uint32_t train_tokens = 0;
277
+ };
278
+
279
+ struct train_params {
280
+ const char * fn_vocab_model;
281
+ const char * fn_llama2c_model;
282
+ const char * fn_llama2c_output_model;
283
+ const char * fn_train_data;
284
+ const char * fn_checkpoint_in;
285
+ const char * fn_checkpoint_out;
286
+ const char * fn_model_out;
287
+
288
+ uint32_t seed;
289
+
290
+ int n_ctx;
291
+ int n_embd;
292
+ int n_mult;
293
+ int n_head;
294
+ int n_layer;
295
+ int n_rotmax;
296
+
297
+ int n_threads;
298
+ int n_batch;
299
+ int n_examples;
300
+ int n_predict;
301
+
302
+ int print_info_interval;
303
+ int print_details_interval;
304
+
305
+ bool samples_start_after_nl;
306
+ bool use_adam;
307
+ bool use_flash;
308
+ bool use_scratch;
309
+
310
+ // only adam
311
+ int warmup;
312
+ int cos_decay_steps;
313
+ float cos_decay_restart;
314
+ float cos_decay_alpha;
315
+
316
+ int lbfgs_n_iter;
317
+ int adam_n_iter;
318
+ float adam_alpha;
319
+ float adam_decay;
320
+
321
+ int mem_model_gb;
322
+ int mem_compute_gb;
323
+ int mem_compute0_gb;
324
+ int mem_compute1_gb;
325
+ };
326
+
327
+ static void print_params(struct my_llama_hparams * params) {
328
+ printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
329
+ printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
330
+ printf("%s: n_embd: %d\n", __func__, params->n_embd);
331
+ printf("%s: n_mult: %d\n", __func__, params->n_mult);
332
+ printf("%s: n_head: %d\n", __func__, params->n_head);
333
+ printf("%s: n_ff: %d\n", __func__, params->n_ff);
334
+ printf("%s: n_layer: %d\n", __func__, params->n_layer);
335
+ printf("%s: n_rot: %d\n", __func__, params->n_rot);
336
+ }
337
+
338
+ static void init_model(struct my_llama_model * model) {
339
+ const auto & hparams = model->hparams;
340
+
341
+ const uint32_t n_embd = hparams.n_embd;
342
+ const uint32_t n_layer = hparams.n_layer;
343
+ const uint32_t n_vocab = hparams.n_vocab;
344
+
345
+ const uint32_t n_ff = hparams.n_ff;
346
+ struct ggml_context * ctx = model->ctx;
347
+
348
+ model->train_its = 0;
349
+ model->train_samples = 0;
350
+ model->train_tokens = 0;
351
+
352
+ model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
353
+ printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
354
+
355
+ model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
356
+ printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
357
+
358
+ model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
359
+ printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
360
+
361
+ // print the per-layer allocations here so we don't print inside the for loop.
362
+ printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
363
+ printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
364
+ printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
365
+ printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
366
+
367
+ printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
368
+
369
+ printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
370
+ printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
371
+ printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
372
+
373
+ ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
374
+ ggml_set_name(model->norm, "norm.weight");
375
+ ggml_set_name(model->output, "output.weight");
376
+
377
+ model->layers.resize(n_layer);
378
+ for (uint32_t i = 0; i < n_layer; ++i) {
379
+ auto & layer = model->layers[i];
380
+
381
+ std::string layers_i = "layers." + std::to_string(i);
382
+
383
+ layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
384
+
385
+ layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
386
+ layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
387
+ layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
388
+ layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
389
+
390
+ layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
391
+
392
+ layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
393
+ layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
394
+ layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
395
+
396
+ ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());
397
+
398
+ ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
399
+ ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
400
+ ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
401
+ ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());
402
+
403
+ ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
404
+
405
+ ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
406
+ ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
407
+ ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
408
+ }
409
+ }
410
+
411
+ static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
412
+ float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
413
+ return *ptr;
414
+ }
415
+
416
+ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
417
+ int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
418
+ return *ptr;
419
+ }
420
+
421
+ static void print_row(struct ggml_tensor * probs, int i) {
422
+ for (int k = 0; k < probs->ne[0]; ++k) {
423
+ float p = get_f32_2d(probs, k, i);
424
+ printf(" %f", p);
425
+ }
426
+ printf("\n");
427
+ }
428
+
429
+ static void print_matrix(struct ggml_tensor * probs) {
430
+ assert(probs->n_dims == 2);
431
+ for (int i = 0; i < probs->ne[1]; ++i) {
432
+ for (int k = 0; k < probs->ne[0]; ++k) {
433
+ float p = get_f32_2d(probs, k, i);
434
+ printf(" %.2f", p);
435
+ }
436
+ printf("\n");
437
+ }
438
+ }
439
+
440
+ #ifdef __GNUC__
441
+ #ifdef __MINGW32__
442
+ __attribute__((format(gnu_printf, 1, 2)))
443
+ #else
444
+ __attribute__((format(printf, 1, 2)))
445
+ #endif
446
+ #endif
447
+ static std::string format(const char * fmt, ...) {
448
+ va_list ap, ap2;
449
+ va_start(ap, fmt);
450
+ va_copy(ap2, ap);
451
+ int size = vsnprintf(NULL, 0, fmt, ap);
452
+ GGML_ASSERT(size >= 0 && size < INT_MAX);
453
+ std::vector<char> buf(size + 1);
454
+ int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
455
+ GGML_ASSERT(size2 == size);
456
+ va_end(ap2);
457
+ va_end(ap);
458
+ return std::string(buf.data(), size);
459
+ }
460
+
461
+ struct llama_file {
462
+ // use FILE * so we don't have to re-open the file to mmap
463
+ FILE * fp;
464
+ size_t size;
465
+
466
+ llama_file(const char * fname, const char * mode) {
467
+ fp = std::fopen(fname, mode);
468
+ if (fp == NULL) {
469
+ size = 0;
470
+ } else {
471
+ seek(0, SEEK_END);
472
+ size = tell();
473
+ seek(0, SEEK_SET);
474
+ }
475
+ }
476
+
477
+ size_t tell() const {
478
+ #ifdef _WIN32
479
+ __int64 ret = _ftelli64(fp);
480
+ #else
481
+ long ret = std::ftell(fp);
482
+ #endif
483
+ GGML_ASSERT(ret != -1); // this really shouldn't fail
484
+ return (size_t) ret;
485
+ }
486
+
487
+ void seek(size_t offset, int whence) {
488
+ #ifdef _WIN32
489
+ int ret = _fseeki64(fp, (__int64) offset, whence);
490
+ #else
491
+ int ret = std::fseek(fp, (long) offset, whence);
492
+ #endif
493
+ GGML_ASSERT(ret == 0); // same
494
+ }
495
+
496
+ void read_raw(void * ptr, size_t size) {
497
+ if (size == 0) {
498
+ return;
499
+ }
500
+ errno = 0;
501
+ std::size_t ret = std::fread(ptr, size, 1, fp);
502
+ if (ferror(fp)) {
503
+ die_fmt("fread failed: %s", strerror(errno));
504
+ }
505
+ if (ret != 1) {
506
+ die("unexpectedly reached end of file");
507
+ }
508
+ }
509
+
510
+ std::uint32_t read_u32() {
511
+ std::uint32_t ret;
512
+ read_raw(&ret, sizeof(ret));
513
+ return ret;
514
+ }
515
+ std::float_t read_f32() {
516
+ std::float_t ret;
517
+ read_raw(&ret, sizeof(ret));
518
+ return ret;
519
+ }
520
+
521
+ std::string read_string(std::uint32_t len) {
522
+ std::vector<char> chars(len);
523
+ read_raw(chars.data(), len);
524
+ return std::string(chars.data(), len);
525
+ }
526
+
527
+ ~llama_file() {
528
+ if (fp) {
529
+ std::fclose(fp);
530
+ }
531
+ }
532
+ };
533
+
534
+ static bool is_ggml_file(const char * filename) {
535
+ llama_file file(filename, "rb");
536
+ if (file.size < 4) {
537
+ return false;
538
+ }
539
+ uint32_t magic = file.read_u32();
540
+ return magic == GGUF_MAGIC;
541
+ }
542
+
543
+ static std::string llama_escape_whitespaces(const std::string & text) {
544
+ std::ostringstream out;
545
+ for (char c : text) {
546
+ if (c == ' ') out << "\xe2\x96\x81";
547
+ else out << c;
548
+ }
549
+ return out.str();
550
+ }
551
+
552
+ static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
553
+ if (is_ggml_file(filename)) {
554
+ struct ggml_context * ctx_data = NULL;
555
+
556
+ struct gguf_init_params params = {
557
+ /*.no_alloc = */ false,
558
+ /*.ctx = */ &ctx_data,
559
+ };
560
+
561
+ struct gguf_context * ctx = gguf_init_from_file(filename, params);
562
+ GGML_ASSERT(ctx != NULL);
563
+
564
+ const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
565
+ GGML_ASSERT(model_idx >= 0);
566
+ std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
567
+ GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);
568
+
569
+ const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
570
+ GGML_ASSERT(token_idx >= 0);
571
+
572
+ const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
573
+ GGML_ASSERT(score_idx >= 0);
574
+ const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
575
+
576
+ const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
577
+ GGML_ASSERT(toktype_idx >= 0);
578
+ const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
579
+
580
+ const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
581
+
582
+ vocab->id_to_token.resize(n_vocab);
583
+
584
+ for (uint32_t i = 0; i < n_vocab; i++) {
585
+ std::string word = gguf_get_arr_str(ctx, token_idx, i);
586
+
587
+ vocab->token_to_id[word] = i;
588
+
589
+ auto & token_data = vocab->id_to_token[i];
590
+ token_data.text = std::move(word);
591
+ token_data.score = scores[i];
592
+ token_data.type = (llama_token_type) toktypes[i];
593
+ }
594
+ ggml_free(ctx_data);
595
+ gguf_free(ctx);
596
+ } else {
597
+ // assume llama2.c vocabulary
598
+ printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
599
+ llama_file file(filename, "rb");
600
+ if (!file.fp) {
601
+ die_fmt("%s: %s", strerror(errno), filename);
602
+ }
603
+ const int n_vocab = config->vocab_size;
604
+ /* uint32_t max_token_length = */ file.read_u32(); // unused
605
+ vocab->id_to_token.resize(n_vocab);
606
+ for (llama_vocab::id id=0; id<n_vocab; ++id) {
607
+ float_t score = file.read_f32();
608
+ uint32_t len = file.read_u32();
609
+ std::string text = file.read_string(len);
610
+
611
+ unsigned char byte_val;
612
+ llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
613
+ if (id == UNKNOWN_TOKEN_ID) {
614
+ text = "<unk>";
615
+ type = LLAMA_TOKEN_TYPE_UNKNOWN;
616
+ } else if (id == BOS_TOKEN_ID) {
617
+ text = "<s>";
618
+ type = LLAMA_TOKEN_TYPE_CONTROL;
619
+ } else if (id == EOS_TOKEN_ID) {
620
+ text = "</s>";
621
+ type = LLAMA_TOKEN_TYPE_CONTROL;
622
+ } else if (text.empty()) {
623
+ type = LLAMA_TOKEN_TYPE_CONTROL;
624
+ } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
625
+ // Text of byte tokens is already in the expected format.
626
+ type = LLAMA_TOKEN_TYPE_BYTE;
627
+ } else {
628
+ type = LLAMA_TOKEN_TYPE_NORMAL;
629
+ }
630
+ text = llama_escape_whitespaces(text);
631
+
632
+ vocab->id_to_token[id].text = text;
633
+ vocab->id_to_token[id].score = score;
634
+ vocab->id_to_token[id].type = type;
635
+ vocab->token_to_id.emplace(text, id);
636
+ }
637
+ }
638
+ }
639
+
640
+ static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
641
+ int ct;
642
+ switch (gg_weights->n_dims){
643
+ case 1:
644
+ ct = 0;
645
+ for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
646
+ float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
647
+ *ptr = karpathy_weights[ct];
648
+ ct++;
649
+ }
650
+ break;
651
+ case 2:
652
+ ct = 0;
653
+ for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
654
+ for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
655
+ float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
656
+ *ptr = karpathy_weights[ct];
657
+ ct++;
658
+ }
659
+ }
660
+ break;
661
+ case 3:
662
+ ct = 0;
663
+ for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
664
+ for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
665
+ for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
666
+ float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
667
+ *ptr = karpathy_weights[ct];
668
+ ct++;
669
+ }
670
+ }
671
+ }
672
+ break;
673
+ }
674
+ }
675
+
676
+ static void save_as_llama_model(
677
+ struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
678
+ ) {
679
+ // convert AK weights into GG weights one by one.
680
+ // w->token_embedding_table -> model->tok_embeddings
681
+ // float* -> struct ggml_tensor
682
+ convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
683
+ convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
684
+
685
+ convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
686
+ //print_row(model->norm, 0);
687
+
688
+ // for rms-att-weight
689
+ int row_length = model->hparams.n_embd;
690
+ int n_ff = model->hparams.n_ff;
691
+
692
+ for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
693
+ auto & layer = model->layers[i];
694
+ // 1d
695
+ convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
696
+ convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
697
+
698
+ // from 3d matrix layer x dim x dim to 2d matrix dim x dim
699
+ convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
700
+ convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
701
+ convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
702
+ convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
703
+
704
+ convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
705
+ convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
706
+ convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
707
+ }
708
+
709
+ struct gguf_context * ctx = gguf_init_empty();
710
+
711
+ std::vector<const char*> tokens;
712
+ std::vector<float> scores;
713
+ std::vector<llama_token_type> token_types;
714
+ for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
715
+ tokens.push_back(token_data.text.c_str());
716
+ scores.push_back(token_data.score);
717
+ token_types.push_back(token_data.type);
718
+ }
719
+ gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
720
+ gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
721
+ gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());
722
+
723
+ gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);
724
+
725
+ gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
726
+ gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");
727
+
728
+ // special tokens
729
+ gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
730
+ gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
731
+ gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
732
+ gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
733
+ gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
734
+
735
+ gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
736
+ gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
737
+ gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
738
+ gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
739
+ // n_head_kv is optional, default to n_head
740
+ // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
741
+ gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
742
+ gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
743
+ gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
744
+
745
+ // write tensors
746
+ ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
747
+ gguf_add_tensor(ctx, model->tok_embeddings);
748
+
749
+ ggml_set_name(model->norm, TN_OUTPUT_NORM);
750
+ gguf_add_tensor(ctx, model->norm);
751
+
752
+ ggml_set_name(model->output, TN_OUTPUT);
753
+ gguf_add_tensor(ctx, model->output);
754
+
755
+ for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
756
+ auto & layer = model->layers[i];
757
+
758
+ ggml_format_name(layer.wq, TN_ATTN_Q, i);
759
+ gguf_add_tensor(ctx, layer.wq);
760
+
761
+ ggml_format_name(layer.wk, TN_ATTN_K, i);
762
+ gguf_add_tensor(ctx, layer.wk);
763
+
764
+ ggml_format_name(layer.wv, TN_ATTN_V, i);
765
+ gguf_add_tensor(ctx, layer.wv);
766
+
767
+ ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
768
+ gguf_add_tensor(ctx, layer.wo);
769
+
770
+ ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
771
+ gguf_add_tensor(ctx, layer.attention_norm);
772
+
773
+ ggml_format_name(layer.w1, TN_FFN_GATE, i);
774
+ gguf_add_tensor(ctx, layer.w1);
775
+
776
+ ggml_format_name(layer.w2, TN_FFN_DOWN, i);
777
+ gguf_add_tensor(ctx, layer.w2);
778
+
779
+ ggml_format_name(layer.w3, TN_FFN_UP, i);
780
+ gguf_add_tensor(ctx, layer.w3);
781
+
782
+ ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
783
+ gguf_add_tensor(ctx, layer.ffn_norm);
784
+ }
785
+
786
+ gguf_write_to_file(ctx, filename, false);
787
+ gguf_free(ctx);
788
+ }
789
+
790
+ static struct train_params get_default_train_params() {
791
+ struct train_params params;
792
+ params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
793
+ params.fn_llama2c_output_model = "ak_llama_model.bin";
794
+ params.fn_train_data = "shakespeare.txt";
795
+ params.fn_checkpoint_in = "checkpoint.bin";
796
+ params.fn_checkpoint_out = "checkpoint.bin";
797
+ params.fn_model_out = "ggml-checkpoint-f32.bin";
798
+
799
+ params.seed = -1;
800
+
801
+ params.n_ctx = 128;
802
+ params.n_embd = 256;
803
+ params.n_mult = 256;
804
+ params.n_head = 8;
805
+ params.n_layer = 16;
806
+ params.n_rotmax = 64;
807
+
808
+ params.n_threads = 6;
809
+ params.n_batch = 8;
810
+ params.n_examples = 8;
811
+ params.n_predict = 1024;
812
+
813
+ params.print_info_interval = 1;
814
+ params.print_details_interval = 2;
815
+
816
+ params.samples_start_after_nl = false;
817
+ params.use_adam = true;
818
+ params.use_flash = true;
819
+ params.use_scratch = true;
820
+
821
+ // only adam
822
+ params.warmup = 100;
823
+ params.cos_decay_steps = 1000;
824
+ params.cos_decay_restart = 1.1f;
825
+ params.cos_decay_alpha = 0.0f;
826
+
827
+ params.lbfgs_n_iter = 16;
828
+ params.adam_n_iter = 16;
829
+ params.adam_alpha = 1e-3f;
830
+ params.adam_decay = 1e-3f;
831
+
832
+ params.mem_model_gb = 2;
833
+ params.mem_compute_gb = 24;
834
+ params.mem_compute0_gb = 8;
835
+ params.mem_compute1_gb = 2;
836
+
837
+ return params;
838
+ }
839
+
840
+ static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
841
+ fprintf(stderr, "usage: %s [options]\n", argv[0]);
842
+ fprintf(stderr, "\n");
843
+ fprintf(stderr, "options:\n");
844
+ fprintf(stderr, " -h, --help show this help message and exit\n");
845
+ fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
846
+ fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
847
+ fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
848
+ fprintf(stderr, "\n");
849
+ }
850
+
851
+ static bool params_parse(int argc, char ** argv, struct train_params * params) {
852
+ bool invalid_param = false;
853
+ bool reqd_param_found = false;
854
+ std::string arg;
855
+ struct train_params default_params = get_default_train_params();
856
+ const std::string arg_prefix = "--";
857
+
858
+ for (int i = 1; i < argc; i++) {
859
+ arg = argv[i];
860
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
861
+ std::replace(arg.begin(), arg.end(), '_', '-');
862
+ }
863
+
864
+ if (arg == "--copy-vocab-from-model") {
865
+ if (++i >= argc) {
866
+ invalid_param = true;
867
+ break;
868
+ }
869
+ params->fn_vocab_model = argv[i];
870
+ } else if (arg == "--llama2c-model") {
871
+ if (++i >= argc) {
872
+ invalid_param = true;
873
+ break;
874
+ }
875
+ reqd_param_found = true;
876
+ params->fn_llama2c_model = argv[i];
877
+ } else if (arg == "--llama2c-output-model") {
878
+ if (++i >= argc) {
879
+ invalid_param = true;
880
+ break;
881
+ }
882
+ params->fn_llama2c_output_model = argv[i];
883
+ } else if (arg == "-h" || arg == "--help") {
884
+ print_usage(argc, argv, &default_params);
885
+ exit(0);
886
+ } else {
887
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
888
+ print_usage(argc, argv, &default_params);
889
+ exit(1);
890
+ }
891
+ }
892
+ if (invalid_param) {
893
+ fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
894
+ print_usage(argc, argv, &default_params);
895
+ exit(1);
896
+ }
897
+ if (!reqd_param_found){
898
+ fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
899
+ print_usage(argc, argv, &default_params);
900
+ exit(1);
901
+ }
902
+
903
+ return true;
904
+ }
905
+
906
+ static std::string basename(const std::string &path) {
907
+ size_t pos = path.find_last_of("/\\");
908
+ if (pos == std::string::npos) {
909
+ return path;
910
+ }
911
+ return path.substr(pos + 1);
912
+ }
913
+
914
+ int main(int argc, char ** argv) {
915
+ struct train_params params = get_default_train_params();
916
+ if (!params_parse(argc, argv, &params)) {
917
+ return 1;
918
+ }
919
+ Config config;
920
+ TransformerWeights weights = {};
921
+ {
922
+ FILE *file = fopen(params.fn_llama2c_model, "rb");
923
+ if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
924
+ // read in the config header
925
+ if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
926
+ auto shared_weights = config.vocab_size > 0;
927
+ config.vocab_size = abs(config.vocab_size);
928
+
929
+ // read in the Transformer weights
930
+ malloc_weights(&weights, &config, shared_weights);
931
+ if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
932
+ fclose(file);
933
+ }
934
+
935
+ struct llama_vocab vocab;
936
+ load_vocab(params.fn_vocab_model, &config, &vocab);
937
+
938
+ struct my_llama_model model;
939
+ model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
940
+ model.hparams.n_ctx = params.n_ctx;
941
+ model.hparams.n_embd = config.dim; //params.n_embd;
942
+ model.hparams.n_ff = config.hidden_dim;
943
+ model.hparams.n_mult = 32;//params.n_mult;
944
+ model.hparams.n_head = config.n_heads; //params.n_head;
945
+ model.hparams.n_layer = config.n_layers; //params.n_layer;
946
+ model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
947
+ print_params(&model.hparams);
948
+ struct ggml_init_params lcparams;
949
+ lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
950
+ lcparams.mem_buffer = NULL;
951
+ lcparams.no_alloc = false;
952
+
953
+ model.ctx = ggml_init(lcparams);
954
+
955
+ init_model(&model);
956
+ model.name = basename(params.fn_llama2c_model);
957
+ save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
958
+
959
+ printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
960
+
961
+ ggml_free(model.ctx);
962
+ return 0;
963
+ }
examples/embd-input/embd-input-lib.cpp CHANGED
@@ -1,8 +1,5 @@
1
- // Defines sigaction on msys:
2
- #ifndef _GNU_SOURCE
3
- #define _GNU_SOURCE
4
- #endif
5
-
6
  #include "embd-input.h"
7
 
8
  #include <cassert>
@@ -23,11 +20,11 @@ extern "C" {
23
  struct MyModel* create_mymodel(int argc, char ** argv) {
24
  gpt_params params;
25
 
26
- if (gpt_params_parse(argc, argv, params) == false) {
27
  return nullptr;
28
  }
29
 
30
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
31
 
32
  if (params.seed == LLAMA_DEFAULT_SEED) {
33
  params.seed = uint32_t(time(NULL));
@@ -167,7 +164,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
167
  llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
168
 
169
  // TODO: Apply penalties
170
- // float nl_logit = logits[llama_token_nl()];
171
  // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
172
  // llama_sample_repetition_penalty(ctx, &candidates_p,
173
  // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -176,7 +173,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
176
  // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
177
  // last_n_repeat, alpha_frequency, alpha_presence);
178
  // if (!penalize_nl) {
179
- // logits[llama_token_nl()] = nl_logit;
180
  // }
181
 
182
  if (temp <= 0) {
@@ -211,10 +208,10 @@ const char * sampling(struct MyModel * mymodel) {
211
  llama_context * ctx = mymodel->ctx;
212
  int id = sampling_id(mymodel);
213
  static std::string ret;
214
- if (id == llama_token_eos()) {
215
  ret = "</s>";
216
  } else {
217
- ret = llama_token_to_str(ctx, id);
218
  }
219
  eval_id(mymodel, id);
220
  return ret.c_str();
 
1
+ #include "build-info.h"
2
+ #include "common.h"
 
 
 
3
  #include "embd-input.h"
4
 
5
  #include <cassert>
 
20
  struct MyModel* create_mymodel(int argc, char ** argv) {
21
  gpt_params params;
22
 
23
+ if (!gpt_params_parse(argc, argv, params)) {
24
  return nullptr;
25
  }
26
 
27
+ print_build_info();
28
 
29
  if (params.seed == LLAMA_DEFAULT_SEED) {
30
  params.seed = uint32_t(time(NULL));
 
164
  llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
165
 
166
  // TODO: Apply penalties
167
+ // float nl_logit = logits[llama_token_nl(ctx)];
168
  // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
169
  // llama_sample_repetition_penalty(ctx, &candidates_p,
170
  // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
 
173
  // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
174
  // last_n_repeat, alpha_frequency, alpha_presence);
175
  // if (!penalize_nl) {
176
+ // logits[llama_token_nl(ctx)] = nl_logit;
177
  // }
178
 
179
  if (temp <= 0) {
 
208
  llama_context * ctx = mymodel->ctx;
209
  int id = sampling_id(mymodel);
210
  static std::string ret;
211
+ if (id == llama_token_eos(ctx)) {
212
  ret = "</s>";
213
  } else {
214
+ ret = llama_token_to_piece(ctx, id);
215
  }
216
  eval_id(mymodel, id);
217
  return ret.c_str();