Committed by Illumotion
Commit 7243d06
1 Parent(s): da7d918

Upload folder using huggingface_hub

.gitignore CHANGED
@@ -45,7 +45,6 @@ models-mnt
  /server
  /simple
  /batched
- /batched-bench
  /export-lora
  /finetune
  /speculative
@@ -107,6 +106,3 @@ tests/test-tokenizer-1-bpe
  rocblas.dll
  hipblas.dll
  koboldcpp_hipblas.so
-
- # Jetbrains idea folder
- .idea/
CMakeLists.txt CHANGED
@@ -356,8 +356,6 @@ add_library(ggml OBJECT
  ggml.h
  ggml-alloc.c
  ggml-alloc.h
- ggml-backend.c
- ggml-backend.h
  k_quants.h
  k_quants.c
  ${GGML_SOURCES_CUDA})
Makefile CHANGED
@@ -372,8 +372,6 @@ endif # LLAMA_NO_K_QUANTS
  #there's no intrinsics or special gpu ops used here, so we can have a universal object
  ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
  $(CC) $(CFLAGS) -c $< -o $@
- ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
- $(CC) $(CFLAGS) -c $< -o $@

  #version 2 libs
  ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
@@ -404,7 +402,7 @@ ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-ope
  $(CC) $(CFLAGS) -c $< -o $@

  # intermediate objects
- llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h
+ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
  common.o: common/common.cpp common/common.h common/log.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -429,7 +427,7 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
  clean:
  rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so

- main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
+ main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
  @echo
  @echo '==== Run ./main -h for help. ===='
@@ -440,11 +438,11 @@ gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)


  #generated libraries
- koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
+ koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
  $(DEFAULT_BUILD)

  ifdef OPENBLAS_BUILD
- koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
+ koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
  $(OPENBLAS_BUILD)
  else
  koboldcpp_openblas:
@@ -452,7 +450,7 @@ koboldcpp_openblas:
  endif

  ifdef FAILSAFE_BUILD
- koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
+ koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o grammar-parser.o $(OBJS)
  $(FAILSAFE_BUILD)
  else
  koboldcpp_failsafe:
@@ -460,7 +458,7 @@ koboldcpp_failsafe:
  endif

  ifdef NOAVX2_BUILD
- koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
+ koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o grammar-parser.o $(OBJS)
  $(NOAVX2_BUILD)
  else
  koboldcpp_noavx2:
@@ -468,7 +466,7 @@ koboldcpp_noavx2:
  endif

  ifdef CLBLAST_BUILD
- koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
+ koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
  $(CLBLAST_BUILD)
  else
  koboldcpp_clblast:
@@ -476,7 +474,7 @@ koboldcpp_clblast:
  endif

  ifdef CUBLAS_BUILD
- koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
+ koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
  $(CUBLAS_BUILD)
  else
  koboldcpp_cublas:
@@ -484,7 +482,7 @@ koboldcpp_cublas:
  endif

  ifdef HIPBLAS_BUILD
- koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(HIP_OBJS) $(OBJS)
+ koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS)
  $(HIPBLAS_BUILD)
  else
  koboldcpp_hipblas:
@@ -492,15 +490,15 @@ koboldcpp_hipblas:
  endif

  # tools
- quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o
+ quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
+ quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
+ quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
+ quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
+ quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

Package.swift CHANGED
@@ -1,10 +1,10 @@
- // swift-tools-version:5.5
+ // swift-tools-version:5.3

  import PackageDescription

  #if arch(arm) || arch(arm64)
  let platforms: [SupportedPlatform]? = [
- .macOS(.v12),
+ .macOS(.v11),
  .iOS(.v14),
  .watchOS(.v4),
  .tvOS(.v14)
@@ -41,13 +41,12 @@ let package = Package(
  "ggml.c",
  "llama.cpp",
  "ggml-alloc.c",
- "ggml-backend.c",
  "k_quants.c",
  ] + additionalSources,
  resources: resources,
  publicHeadersPath: "spm-headers",
  cSettings: [
- .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
+ .unsafeFlags(["-Wno-shorten-64-to-32"]),
  .define("GGML_USE_K_QUANTS"),
  .define("GGML_USE_ACCELERATE")
  // NOTE: NEW_LAPACK will required iOS version 16.4+
colab.ipynb CHANGED
@@ -33,7 +33,7 @@
  "!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &\r\n",
  "!sleep 10\r\n",
  "!cat nohup.out\r\n",
- "!python koboldcpp.py model.ggml --usecublas 0 mmq --gpulayers $Layers\r\n"
+ "!python koboldcpp.py model.ggml --usecublas 0 mmq --gpulayers $Layers --hordeconfig concedo\r\n"
  ]
  }
  ],
common/CMakeLists.txt CHANGED
@@ -5,8 +5,6 @@ set(TARGET common)
  add_library(${TARGET} OBJECT
  common.h
  common.cpp
- sampling.h
- sampling.cpp
  console.h
  console.cpp
  grammar-parser.h
common/common.cpp CHANGED
@@ -107,7 +107,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  std::string arg;
  gpt_params default_params;
  const std::string arg_prefix = "--";
- llama_sampling_params & sparams = params.sampling_params;

  for (int i = 1; i < argc; i++) {
  arg = argv[i];
@@ -185,7 +184,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  invalid_param = true;
  break;
  }
- sparams.top_k = std::stoi(argv[i]);
+ params.top_k = std::stoi(argv[i]);
  } else if (arg == "-c" || arg == "--ctx-size") {
  if (++i >= argc) {
  invalid_param = true;
@@ -217,73 +216,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  invalid_param = true;
  break;
  }
- sparams.top_p = std::stof(argv[i]);
+ params.top_p = std::stof(argv[i]);
  } else if (arg == "--temp") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.temp = std::stof(argv[i]);
+ params.temp = std::stof(argv[i]);
  } else if (arg == "--tfs") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.tfs_z = std::stof(argv[i]);
+ params.tfs_z = std::stof(argv[i]);
  } else if (arg == "--typical") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.typical_p = std::stof(argv[i]);
+ params.typical_p = std::stof(argv[i]);
  } else if (arg == "--repeat-last-n") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.repeat_last_n = std::stoi(argv[i]);
+ params.repeat_last_n = std::stoi(argv[i]);
  } else if (arg == "--repeat-penalty") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.repeat_penalty = std::stof(argv[i]);
+ params.repeat_penalty = std::stof(argv[i]);
  } else if (arg == "--frequency-penalty") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.frequency_penalty = std::stof(argv[i]);
+ params.frequency_penalty = std::stof(argv[i]);
  } else if (arg == "--presence-penalty") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.presence_penalty = std::stof(argv[i]);
+ params.presence_penalty = std::stof(argv[i]);
  } else if (arg == "--mirostat") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.mirostat = std::stoi(argv[i]);
+ params.mirostat = std::stoi(argv[i]);
  } else if (arg == "--mirostat-lr") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.mirostat_eta = std::stof(argv[i]);
+ params.mirostat_eta = std::stof(argv[i]);
  } else if (arg == "--mirostat-ent") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.mirostat_tau = std::stof(argv[i]);
+ params.mirostat_tau = std::stof(argv[i]);
  } else if (arg == "--cfg-negative-prompt") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.cfg_negative_prompt = argv[i];
+ params.cfg_negative_prompt = argv[i];
  } else if (arg == "--cfg-negative-prompt-file") {
  if (++i >= argc) {
  invalid_param = true;
@@ -295,16 +294,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  invalid_param = true;
  break;
  }
- std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
- if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
- sparams.cfg_negative_prompt.pop_back();
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
+ if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
+ params.cfg_negative_prompt.pop_back();
  }
  } else if (arg == "--cfg-scale") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.cfg_scale = std::stof(argv[i]);
+ params.cfg_scale = std::stof(argv[i]);
  } else if (arg == "-b" || arg == "--batch-size") {
  if (++i >= argc) {
  invalid_param = true;
@@ -513,7 +512,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  } else if (arg == "--ignore-eos") {
  params.ignore_eos = true;
  } else if (arg == "--no-penalize-nl") {
- sparams.penalize_nl = false;
+ params.penalize_nl = false;
  } else if (arg == "-l" || arg == "--logit-bias") {
  if (++i >= argc) {
  invalid_param = true;
@@ -525,7 +524,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  std::string value_str;
  try {
  if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
- sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+ params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
  } else {
  throw std::exception();
  }
@@ -628,8 +627,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  }

  void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
- const llama_sampling_params & sparams = params.sampling_params;
-
  printf("usage: %s [options]\n", argv[0]);
  printf("\n");
  printf("options:\n");
@@ -662,19 +659,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
  printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
  printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
- printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
- printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
- printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
- printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
- printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
- printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
- printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
- printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
+ printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+ printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+ printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+ printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+ printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+ printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+ printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+ printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
  printf(" --mirostat N use Mirostat sampling.\n");
  printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
- printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
- printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
- printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
+ printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+ printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+ printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
  printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
  printf(" modifies the likelihood of token appearing in the completion,\n");
  printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
@@ -685,7 +682,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" negative prompt to use for guidance. (default: empty)\n");
  printf(" --cfg-negative-prompt-file FNAME\n");
  printf(" negative prompt file to use for guidance. (default: empty)\n");
- printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
+ printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
  printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
  printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
  printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
@@ -693,7 +690,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" --no-penalize-nl do not penalize newline token\n");
  printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
  printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
- printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
+ printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
  printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
  printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
  printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
@@ -843,7 +840,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
  }

  if (params.ignore_eos) {
- params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+ params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
  }

  {
@@ -935,6 +932,127 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
  return result;
  }

+ //
+ // Sampling utils
+ //
+
+ llama_token llama_sample_token(
+ struct llama_context * ctx,
+ struct llama_context * ctx_guidance,
+ struct llama_grammar * grammar,
+ const struct gpt_params & params,
+ const std::vector<llama_token> & last_tokens,
+ std::vector<llama_token_data> & candidates,
+ int idx) {
+ const int n_ctx = llama_n_ctx(ctx);
+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+ const float temp = params.temp;
+ const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
+ const float top_p = params.top_p;
+ const float tfs_z = params.tfs_z;
+ const float typical_p = params.typical_p;
+ const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+ const float repeat_penalty = params.repeat_penalty;
+ const float alpha_presence = params.presence_penalty;
+ const float alpha_frequency = params.frequency_penalty;
+ const int mirostat = params.mirostat;
+ const float mirostat_tau = params.mirostat_tau;
+ const float mirostat_eta = params.mirostat_eta;
+ const bool penalize_nl = params.penalize_nl;
+
+ llama_token id = 0;
+
+ float * logits = llama_get_logits_ith(ctx, idx);
+
+ // Apply params.logit_bias map
+ for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+ logits[it->first] += it->second;
+ }
+
+ candidates.clear();
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+ candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+ }
+
+ llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
+
+ if (ctx_guidance) {
+ llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
+ }
+
+ // apply penalties
+ if (!last_tokens.empty()) {
+ const float nl_logit = logits[llama_token_nl(ctx)];
+ const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
+
+ llama_sample_repetition_penalty(ctx, &cur_p,
+ last_tokens.data() + last_tokens.size() - last_n_repeat,
+ last_n_repeat, repeat_penalty);
+ llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
+ last_tokens.data() + last_tokens.size() - last_n_repeat,
+ last_n_repeat, alpha_frequency, alpha_presence);
+
+ if (!penalize_nl) {
+ for (size_t idx = 0; idx < cur_p.size; idx++) {
+ if (cur_p.data[idx].id == llama_token_nl(ctx)) {
+ cur_p.data[idx].logit = nl_logit;
+ break;
+ }
+ }
+ }
+ }
+
+ if (grammar != NULL) {
+ llama_sample_grammar(ctx, &cur_p, grammar);
+ }
+
+ if (temp <= 0) {
+ // Greedy sampling
+ id = llama_sample_token_greedy(ctx, &cur_p);
+ } else {
+ if (mirostat == 1) {
+ static float mirostat_mu = 2.0f * mirostat_tau;
+ const int mirostat_m = 100;
+ llama_sample_temp(ctx, &cur_p, temp);
+ id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+ } else if (mirostat == 2) {
+ static float mirostat_mu = 2.0f * mirostat_tau;
+ llama_sample_temp(ctx, &cur_p, temp);
+ id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+ } else {
+ // Temperature sampling
+ size_t min_keep = std::max(1, params.n_probs);
+ llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
+ llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
+ llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
+ llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
+ llama_sample_temp(ctx, &cur_p, temp);
+
+ {
+ const int n_top = 10;
+ LOG("top %d candidates:\n", n_top);
+
+ for (int i = 0; i < n_top; i++) {
+ const llama_token id = cur_p.data[i].id;
+ LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
+ }
+ }
+
+ id = llama_sample_token(ctx, &cur_p);
+
+ LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
+ }
+ }
+ // printf("`%d`", candidates_p.size);
+
+ if (grammar != NULL) {
+ llama_grammar_accept_token(ctx, grammar, id);
+ }
+
+ return id;
+ }
+
  //
  // YAML utils
  //
@@ -1086,8 +1204,6 @@ std::string get_sortable_timestamp() {

  void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
  const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
- const llama_sampling_params & sparams = params.sampling_params;
-
  fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
  fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
  fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
@@ -1134,21 +1250,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l

  fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
  fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
- dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
- fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
+ dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
+ fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
  fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
  fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
  fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
  fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
  fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
- fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
+ fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
  dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
  fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
  fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
  fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);

- const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
- const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
+ const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
+ const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
  fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");

  dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
@@ -1161,7 +1277,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
  fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());

  fprintf(stream, "logit_bias:\n");
- for (std::pair<llama_token, float> lb : sparams.logit_bias) {
+ for (std::pair<llama_token, float> lb : params.logit_bias) {
  if (ignore_eos && lb.first == logit_bias_eos->first) {
  continue;
  }
@@ -1185,30 +1301,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
  fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
  fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
  fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
- fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
- fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
- fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
+ fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
+ fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
+ fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
  fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
  fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
  fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
  fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
  fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
  fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
- fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
+ fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
  fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
  fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
- fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
+ fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
  fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
  fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
  fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
- fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
+ fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
  dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
  fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
  fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
  fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
  dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
  fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
- fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
+ fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);

  fprintf(stream, "reverse_prompt:\n");
  for (std::string ap : params.antiprompt) {
@@ -1226,15 +1342,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
  fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
  fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
  fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
- fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
+ fprintf(stream, "temp: %f # default: 0.8\n", params.temp);

  const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
  dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);

- fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
+ fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
  fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
- fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
- fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
- fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
+ fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
+ fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
+ fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
  fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
  }
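The llama_sample_token helper added above reads every sampling knob directly from gpt_params instead of the removed llama_sampling_params struct. A minimal sketch of the fields involved, with illustrative values that are not taken from this commit:

    gpt_params params;
    params.temp        = 0.7f;       // <= 0 switches llama_sample_token to greedy sampling
    params.top_k       = 40;         // <= 0 falls back to the full vocabulary
    params.top_p       = 0.95f;
    params.mirostat    = 0;          // 1 or 2 selects the Mirostat samplers instead
    params.penalize_nl = false;      // same effect as the --no-penalize-nl flag
    params.logit_bias[15043] = 1.0f; // same effect as --logit-bias 15043+1 on the CLI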
common/common.h CHANGED
@@ -4,8 +4,6 @@

  #include "llama.h"

- #include "sampling.h"
-
  #define LOG_NO_FILE_LINE_FUNCTION
  #include "log.h"

@@ -51,6 +49,7 @@ struct gpt_params {
  int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
  int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
  float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
  int32_t n_beams = 0; // if non-zero then use beam search of given width.
  float rope_freq_base = 0.0f; // RoPE base frequency
  float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
@@ -68,8 +67,13 @@ struct gpt_params {
  int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
  float mirostat_tau = 5.00f; // target entropy
  float mirostat_eta = 0.10f; // learning rate
- // // sampling parameters
- struct llama_sampling_params sampling_params;
+
+ std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
+ // Classifier-Free Guidance
+ // https://arxiv.org/abs/2306.17806
+ std::string cfg_negative_prompt; // string to help guidance
+ float cfg_scale = 1.f; // How strong is guidance

  std::string model = "models/7B/ggml-model-f16.gguf"; // model path
  std::string model_draft = ""; // draft model for speculative decoding
@@ -111,6 +115,7 @@ struct gpt_params {
  bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
  bool ignore_eos = false; // ignore generated EOS tokens
  bool instruct = false; // instruction mode (used for Alpaca models)
+ bool penalize_nl = true; // consider newlines as a repeatable token
  bool logits_all = false; // return logits for all tokens in the batch
  bool use_mmap = true; // use mmap for faster loads
  bool use_mlock = false; // use mlock to keep model in memory
@@ -175,6 +180,36 @@ std::string llama_detokenize_bpe(
  llama_context * ctx,
  const std::vector<llama_token> & tokens);

+ //
+ // Sampling utils
+ //
+
+ // this is a common sampling function used across the examples for convenience
+ // it can serve as a starting point for implementing your own sampling function
+ //
+ // required:
+ // - ctx: context to use for sampling
+ // - params: sampling parameters
+ //
+ // optional:
+ // - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
+ // - grammar: grammar to use for sampling, ignore if NULL
+ // - last_tokens: needed for repetition penalty, ignore if empty
+ // - idx: sample from llama_get_logits_ith(ctx, idx)
+ //
+ // returns:
+ // - token: sampled token
+ // - candidates: vector of candidate tokens
+ //
+ llama_token llama_sample_token(
+ struct llama_context * ctx,
+ struct llama_context * ctx_guidance,
+ struct llama_grammar * grammar,
+ const struct gpt_params & params,
+ const std::vector<llama_token> & last_tokens,
+ std::vector<llama_token_data> & candidates,
+ int idx = 0);
+
  //
  // YAML utils
  //
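Given the declaration above, a caller drives the helper roughly as follows. This is a minimal sketch rather than code from the commit; the next_token wrapper name and the per-call candidate buffer are assumptions:

    #include "common.h"

    // Sample one token from the last evaluated position, with no guidance
    // context and no grammar; last_tokens feeds the repetition penalties.
    static llama_token next_token(llama_context * ctx, const gpt_params & params,
                                  std::vector<llama_token> & last_tokens) {
        std::vector<llama_token_data> candidates;
        candidates.reserve(llama_n_vocab(llama_get_model(ctx)));

        const llama_token id = llama_sample_token(ctx, /*ctx_guidance*/ nullptr, /*grammar*/ nullptr,
                                                  params, last_tokens, candidates);

        last_tokens.push_back(id); // keep a history so the repeat penalty has something to look at
        return id;
    }

Passing a non-NULL ctx_guidance enables the classifier-free guidance path controlled by the new cfg_scale and cfg_negative_prompt fields of gpt_params.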
convert-refact-hf-to-gguf.py CHANGED
@@ -17,6 +17,33 @@ if "NO_LOCAL_GGUF" not in os.environ:
  sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
  import gguf

+
+ def bytes_to_unicode():
+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+ """
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
+ The reversible bpe codes work on unicode strings.
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+ This is a significant percentage of your normal, say, 32K bpe vocab.
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
+ """
+ bs = (
+ list(range(ord("!"), ord("~") + 1))
+ + list(range(ord("¡"), ord("¬") + 1))
+ + list(range(ord("®"), ord("ÿ") + 1))
+ )
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8 + n)
+ n += 1
+ return dict(zip(bs, (chr(n) for n in cs)))
+
+
  def count_model_parts(dir_model: Path) -> int:
  num_parts = 0
  for filename in os.listdir(dir_model):
@@ -126,25 +153,53 @@ tokens: list[bytearray] = []
  scores: list[float] = []
  toktypes: list[int] = []

+ tokenizer_json_file = dir_model / "tokenizer.json"
+ if not tokenizer_json_file.is_file():
+ print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr)
+ sys.exit(1)
+
  # gpt2 tokenizer
  gguf_writer.add_tokenizer_model("gpt2")

- print("gguf: get gpt2 tokenizer vocab")
+ with open(tokenizer_json_file, "r", encoding="utf-8") as f:
+ tokenizer_json = json.load(f)

- # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
- tokenizer = AutoTokenizer.from_pretrained(dir_model)
+ print("gguf: get gpt2 tokenizer vocab")

  # The number of tokens in tokenizer.json can differ from the expected vocab size.
  # This causes downstream issues with mismatched tensor sizes when running the inference
- vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
- assert max(tokenizer.vocab.values()) < vocab_size
+ vocab_size = (
+ hparams["vocab_size"]
+ if "vocab_size" in hparams
+ else len(tokenizer_json["model"]["vocab"])
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)

  reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+ byte_encoder = bytes_to_unicode()
+ byte_decoder = {v: k for k, v in byte_encoder.items()}

  for i in range(vocab_size):
- tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
- scores.append(0.0) # dummy
- toktypes.append(gguf.TokenType.NORMAL)
+ if i in reverse_vocab:
+ text = reverse_vocab[i]
+ try:
+ text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+ except KeyError:
+ text = bytearray()
+ for c in reverse_vocab[i]:
+ if ord(c) < 256: # single byte character
+ text.append(byte_decoder[ord(c)])
+ else: # multibyte special token character
+ text.extend(c.encode("utf-8"))
+ else:
+ print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+ pad_token = f"[PAD{i}]".encode("utf8")
+ text = bytearray(pad_token)
+
+ tokens.append(text)
+ scores.append(0.0) # dymmy
+ toktypes.append(gguf.TokenType.NORMAL) # dummy

  gguf_writer.add_token_list(tokens)
  gguf_writer.add_token_scores(scores)
examples/CMakeLists.txt CHANGED
@@ -25,7 +25,6 @@ else()
  add_subdirectory(convert-llama2c-to-ggml)
  add_subdirectory(simple)
  add_subdirectory(batched)
- add_subdirectory(batched-bench)
  add_subdirectory(speculative)
  add_subdirectory(parallel)
  add_subdirectory(embd-input)
examples/batched/batched.cpp CHANGED
@@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
  ctx_params.seed = 1234;
  ctx_params.n_ctx = n_kv_req;
  ctx_params.n_batch = std::max(n_len, n_parallel);
- ctx_params.n_threads = params.n_threads;
+ ctx_params.n_threads = params.n_threads;
  ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

  llama_context * ctx = llama_new_context_with_model(model, ctx_params);
examples/embd-input/embd-input-lib.cpp CHANGED
@@ -128,22 +128,21 @@ bool eval_string(struct MyModel * mymodel,const char* str){
128
  llama_token sampling_id(struct MyModel* mymodel) {
129
  llama_context* ctx = mymodel->ctx;
130
  gpt_params params = mymodel->params;
131
- llama_sampling_params & sparams = params.sampling_params;
132
  // int n_ctx = llama_n_ctx(ctx);
133
 
134
  // out of user input, sample next token
135
- const float temp = sparams.temp;
136
- const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
137
- const float top_p = sparams.top_p;
138
- const float tfs_z = sparams.tfs_z;
139
- const float typical_p = sparams.typical_p;
140
  // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
141
  // const float repeat_penalty = params.repeat_penalty;
142
  // const float alpha_presence = params.presence_penalty;
143
  // const float alpha_frequency = params.frequency_penalty;
144
- const int mirostat = sparams.mirostat;
145
- const float mirostat_tau = sparams.mirostat_tau;
146
- const float mirostat_eta = sparams.mirostat_eta;
147
  // const bool penalize_nl = params.penalize_nl;
148
 
149
  llama_token id = 0;
@@ -152,7 +151,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
152
  auto n_vocab = llama_n_vocab(llama_get_model(ctx));
153
 
154
  // Apply params.logit_bias map
155
- for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
156
  logits[it->first] += it->second;
157
  }
158
 
 
128
  llama_token sampling_id(struct MyModel* mymodel) {
129
  llama_context* ctx = mymodel->ctx;
130
  gpt_params params = mymodel->params;
 
131
  // int n_ctx = llama_n_ctx(ctx);
132
 
133
  // out of user input, sample next token
134
+ const float temp = params.temp;
135
+ const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
136
+ const float top_p = params.top_p;
137
+ const float tfs_z = params.tfs_z;
138
+ const float typical_p = params.typical_p;
139
  // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
140
  // const float repeat_penalty = params.repeat_penalty;
141
  // const float alpha_presence = params.presence_penalty;
142
  // const float alpha_frequency = params.frequency_penalty;
143
+ const int mirostat = params.mirostat;
144
+ const float mirostat_tau = params.mirostat_tau;
145
+ const float mirostat_eta = params.mirostat_eta;
146
  // const bool penalize_nl = params.penalize_nl;
147
 
148
  llama_token id = 0;
 
151
  auto n_vocab = llama_n_vocab(llama_get_model(ctx));
152
 
153
  // Apply params.logit_bias map
154
+ for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
155
  logits[it->first] += it->second;
156
  }
157
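
One detail of the block above that is easy to miss: after the sampling options are read straight off gpt_params, the logit_bias map is added to the raw logits before any sampler runs. A short sketch of that step, assuming the usual llama_get_logits accessor for the logits pointer:

// Apply the (token id -> bias) map to the raw logits before sampling.
// A bias of -INFINITY effectively bans a token; the server changes further
// below use exactly this to implement ignore_eos.
float * logits = llama_get_logits(ctx);
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
    logits[it->first] += it->second;
}
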
 
examples/infill/infill.cpp CHANGED
@@ -104,7 +104,6 @@ static void sigint_handler(int signo) {
104
 
105
  int main(int argc, char ** argv) {
106
  gpt_params params;
107
- llama_sampling_params & sparams = params.sampling_params;
108
  g_params = &params;
109
 
110
  if (!gpt_params_parse(argc, argv, params)) {
@@ -207,7 +206,7 @@ int main(int argc, char ** argv) {
207
  // load the model and apply lora adapter, if any
208
  LOG("%s: load the model and apply lora adapter, if any\n", __func__);
209
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
210
- if (sparams.cfg_scale > 1.f) {
211
  struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
212
  ctx_guidance = llama_new_context_with_model(model, lparams);
213
  }
@@ -234,22 +233,10 @@ int main(int argc, char ** argv) {
234
  const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
235
  LOG("add_bos: %d\n", add_bos);
236
 
237
- bool suff_rm_leading_spc = params.escape;
238
- if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
239
- params.input_suffix.erase(0, 1);
240
- suff_rm_leading_spc = false;
241
- }
242
  std::vector<llama_token> embd_inp;
243
- std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
244
- std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
245
- const int space_token = 29871;
246
- if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
247
- inp_sfx.erase(inp_sfx.begin());
248
- }
249
  inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
250
- if (add_bos) {
251
- inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
252
- }
253
  inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
254
  embd_inp = inp_pfx;
255
  embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
@@ -270,9 +257,9 @@ int main(int argc, char ** argv) {
270
  int guidance_offset = 0;
271
  int original_prompt_len = 0;
272
  if (ctx_guidance) {
273
- LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
274
 
275
- guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
276
  LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
277
 
278
  std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@@ -313,7 +300,7 @@ int main(int argc, char ** argv) {
313
 
314
  if (ctx_guidance) {
315
  LOG_TEE("\n");
316
- LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
317
  LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
318
  for (int i = 0; i < (int) guidance_inp.size(); i++) {
319
  LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@@ -359,7 +346,7 @@ int main(int argc, char ** argv) {
359
  }
360
  }
361
  LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
362
- sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
363
  LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
364
  LOG_TEE("\n\n");
365
 
@@ -377,8 +364,8 @@ int main(int argc, char ** argv) {
377
  LOG_TEE("\n");
378
 
379
  {
380
- auto it = sparams.logit_bias.find(llama_token_eos(ctx));
381
- if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
382
  LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
383
  }
384
  }
@@ -435,7 +422,6 @@ int main(int argc, char ** argv) {
435
 
436
  const int n_vocab = llama_n_vocab(model);
437
 
438
- llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
439
  std::vector<llama_token_data> candidates;
440
  candidates.reserve(n_vocab);
441
 
@@ -554,7 +540,7 @@ int main(int argc, char ** argv) {
554
 
555
  if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
556
 
557
- const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
558
 
559
  last_tokens.erase(last_tokens.begin());
560
  last_tokens.push_back(id);
@@ -641,27 +627,10 @@ int main(int argc, char ** argv) {
641
  buffer.clear();
642
  // done taking input, reset color
643
  console::set_display(console::reset);
644
-
645
- if (params.escape) {
646
- //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
647
- process_escapes(params.input_prefix);
648
- process_escapes(params.input_suffix);
649
- }
650
- suff_rm_leading_spc = params.escape;
651
- if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
652
- params.input_suffix.erase(0, 1);
653
- suff_rm_leading_spc = false;
654
- }
655
  // tokenize new prefix and suffix
656
- std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
657
- std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
658
- if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
659
- inp_sfx.erase(inp_sfx.begin());
660
- }
661
  inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
662
- if (add_bos) {
663
- inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
664
- }
665
  inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
666
  embd_inp = inp_pfx;
667
  embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
 
104
 
105
  int main(int argc, char ** argv) {
106
  gpt_params params;
 
107
  g_params = &params;
108
 
109
  if (!gpt_params_parse(argc, argv, params)) {
 
206
  // load the model and apply lora adapter, if any
207
  LOG("%s: load the model and apply lora adapter, if any\n", __func__);
208
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
209
+ if (params.cfg_scale > 1.f) {
210
  struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
211
  ctx_guidance = llama_new_context_with_model(model, lparams);
212
  }
 
233
  const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
234
  LOG("add_bos: %d\n", add_bos);
235
 
 
 
 
 
 
236
  std::vector<llama_token> embd_inp;
237
+ std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
238
+ std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
 
 
 
 
239
  inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
 
 
 
240
  inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
241
  embd_inp = inp_pfx;
242
  embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
 
257
  int guidance_offset = 0;
258
  int original_prompt_len = 0;
259
  if (ctx_guidance) {
260
+ LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
261
 
262
+ guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
263
  LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
264
 
265
  std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
 
300
 
301
  if (ctx_guidance) {
302
  LOG_TEE("\n");
303
+ LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
304
  LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
305
  for (int i = 0; i < (int) guidance_inp.size(); i++) {
306
  LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
 
346
  }
347
  }
348
  LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
349
+ params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
350
  LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
351
  LOG_TEE("\n\n");
352
 
 
364
  LOG_TEE("\n");
365
 
366
  {
367
+ auto it = params.logit_bias.find(llama_token_eos(ctx));
368
+ if (it != params.logit_bias.end() && it->second == -INFINITY) {
369
  LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
370
  }
371
  }
 
422
 
423
  const int n_vocab = llama_n_vocab(model);
424
 
 
425
  std::vector<llama_token_data> candidates;
426
  candidates.reserve(n_vocab);
427
 
 
540
 
541
  if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
542
 
543
+ const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
544
 
545
  last_tokens.erase(last_tokens.begin());
546
  last_tokens.push_back(id);
 
627
  buffer.clear();
628
  // done taking input, reset color
629
  console::set_display(console::reset);
 
 
 
 
 
 
 
 
 
 
 
630
  // tokenize new prefix and suffix
631
+ std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
632
+ std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
 
 
 
633
  inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
 
 
 
634
  inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
635
  embd_inp = inp_pfx;
636
  embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
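
With the leading-space and BOS special-casing gone, the infill prompt reduces to: tokenize prefix and suffix (BOS handled by the tokenizer call), wrap them in the fill-in-the-middle marker tokens, and concatenate. A compact sketch using only helpers that appear in this hunk; the middle marker, llama_token_middle, shows up in the server's loadInfill further below.

// Resulting layout: [FIM_PRE] <prefix tokens> [FIM_SUF] <suffix tokens>
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);

inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); // FIM prefix marker
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); // FIM suffix marker

std::vector<llama_token> embd_inp = inp_pfx;
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
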
examples/main/main.cpp CHANGED
@@ -109,7 +109,6 @@ int main(int argc, char ** argv) {
109
  if (!gpt_params_parse(argc, argv, params)) {
110
  return 1;
111
  }
112
- llama_sampling_params & sparams = params.sampling_params;
113
 
114
  #ifndef LOG_DISABLE_LOGS
115
  log_set_target(log_filename_generator("main", "log"));
@@ -180,7 +179,7 @@ int main(int argc, char ** argv) {
180
  // load the model and apply lora adapter, if any
181
  LOG("%s: load the model and apply lora adapter, if any\n", __func__);
182
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
183
- if (sparams.cfg_scale > 1.f) {
184
  struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
185
  ctx_guidance = llama_new_context_with_model(model, lparams);
186
  }
@@ -258,9 +257,9 @@ int main(int argc, char ** argv) {
258
  int guidance_offset = 0;
259
  int original_prompt_len = 0;
260
  if (ctx_guidance) {
261
- LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
262
 
263
- guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
264
  LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
265
 
266
  std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@@ -297,9 +296,6 @@ int main(int argc, char ** argv) {
297
  LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
298
  __func__, n_matching_session_tokens, embd_inp.size());
299
  }
300
-
301
- // remove any "future" tokens that we might have inherited from the previous session
302
- llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
303
  }
304
 
305
  LOGLN(
@@ -347,7 +343,7 @@ int main(int argc, char ** argv) {
347
 
348
  if (ctx_guidance) {
349
  LOG_TEE("\n");
350
- LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
351
  LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
352
  for (int i = 0; i < (int) guidance_inp.size(); i++) {
353
  LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@@ -399,7 +395,7 @@ int main(int argc, char ** argv) {
399
  }
400
  }
401
  LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
402
- sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
403
  LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
404
  LOG_TEE("\n\n");
405
 
@@ -417,8 +413,8 @@ int main(int argc, char ** argv) {
417
  LOG_TEE("\n");
418
 
419
  {
420
- auto it = sparams.logit_bias.find(llama_token_eos(ctx));
421
- if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
422
  LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
423
  }
424
  }
@@ -473,7 +469,6 @@ int main(int argc, char ** argv) {
473
 
474
  const int n_vocab = llama_n_vocab(model);
475
 
476
- llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
477
  std::vector<llama_token_data> candidates;
478
  candidates.reserve(n_vocab);
479
 
@@ -548,6 +543,9 @@ int main(int argc, char ** argv) {
548
  if (i > 0) {
549
  embd.erase(embd.begin(), embd.begin() + i);
550
  }
 
 
 
551
  }
552
 
553
  // evaluate tokens in batches
@@ -627,7 +625,7 @@ int main(int argc, char ** argv) {
627
  LOG("saved session to %s\n", path_session.c_str());
628
  }
629
 
630
- const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
631
 
632
  last_tokens.erase(last_tokens.begin());
633
  last_tokens.push_back(id);
 
109
  if (!gpt_params_parse(argc, argv, params)) {
110
  return 1;
111
  }
 
112
 
113
  #ifndef LOG_DISABLE_LOGS
114
  log_set_target(log_filename_generator("main", "log"));
 
179
  // load the model and apply lora adapter, if any
180
  LOG("%s: load the model and apply lora adapter, if any\n", __func__);
181
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
182
+ if (params.cfg_scale > 1.f) {
183
  struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
184
  ctx_guidance = llama_new_context_with_model(model, lparams);
185
  }
 
257
  int guidance_offset = 0;
258
  int original_prompt_len = 0;
259
  if (ctx_guidance) {
260
+ LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
261
 
262
+ guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
263
  LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
264
 
265
  std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
 
296
  LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
297
  __func__, n_matching_session_tokens, embd_inp.size());
298
  }
 
 
 
299
  }
300
 
301
  LOGLN(
 
343
 
344
  if (ctx_guidance) {
345
  LOG_TEE("\n");
346
+ LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
347
  LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
348
  for (int i = 0; i < (int) guidance_inp.size(); i++) {
349
  LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
 
395
  }
396
  }
397
  LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
398
+ params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
399
  LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
400
  LOG_TEE("\n\n");
401
 
 
413
  LOG_TEE("\n");
414
 
415
  {
416
+ auto it = params.logit_bias.find(llama_token_eos(ctx));
417
+ if (it != params.logit_bias.end() && it->second == -INFINITY) {
418
  LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
419
  }
420
  }
 
469
 
470
  const int n_vocab = llama_n_vocab(model);
471
 
 
472
  std::vector<llama_token_data> candidates;
473
  candidates.reserve(n_vocab);
474
 
 
543
  if (i > 0) {
544
  embd.erase(embd.begin(), embd.begin() + i);
545
  }
546
+
547
+ // remove any "future" tokens that we might have inherited from the session from the KV cache
548
+ llama_kv_cache_tokens_rm(ctx, n_past, -1);
549
  }
550
 
551
  // evaluate tokens in batches
 
625
  LOG("saved session to %s\n", path_session.c_str());
626
  }
627
 
628
+ const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
629
 
630
  last_tokens.erase(last_tokens.begin());
631
  last_tokens.push_back(id);
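
Two behavioural points in the main-example hunk: sampling goes back to the plain llama_sample_token(ctx, ctx_guidance, grammar, params, ...) call, and the KV-cache clean-up for reused sessions now runs right after embd is trimmed, keyed on n_past. The clean-up is sketched below; the call semantics are as used in the hunk (second argument is the first cache position to drop, -1 means "to the end"), treat the exact signature as an assumption of this API vintage.

if (i > 0) {
    embd.erase(embd.begin(), embd.begin() + i);
}

// Drop cache entries [n_past, end) inherited from the saved session, so any
// "future" tokens are re-evaluated instead of being trusted as-is.
llama_kv_cache_tokens_rm(ctx, n_past, -1);
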
examples/parallel/parallel.cpp CHANGED
@@ -125,8 +125,6 @@ int main(int argc, char ** argv) {
125
  params.logits_all = true;
126
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
127
 
128
- llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
129
-
130
  // load the prompts from an external file if there are any
131
  if (params.prompt.empty()) {
132
  printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
@@ -169,7 +167,7 @@ int main(int argc, char ** argv) {
169
 
170
  // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
171
  // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
172
- llama_batch batch = llama_batch_init(n_ctx, 0);
173
 
174
  int32_t n_total_prompt = 0;
175
  int32_t n_total_gen = 0;
@@ -341,7 +339,7 @@ int main(int argc, char ** argv) {
341
  //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
342
  // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
343
 
344
- const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);
345
 
346
  if (client.n_decoded == 1) {
347
  // start measuring generation time after the first token to make sure all concurrent clients
@@ -386,7 +384,7 @@ int main(int argc, char ** argv) {
386
 
387
  n_total_prompt += client.n_prompt;
388
  n_total_gen += client.n_decoded;
389
- llama_sampling_context_reset(ctx_sampling, client.seq_id);
390
  client.seq_id = -1;
391
  }
392
 
 
125
  params.logits_all = true;
126
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
127
 
 
 
128
  // load the prompts from an external file if there are any
129
  if (params.prompt.empty()) {
130
  printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
 
167
 
168
  // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
169
  // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
170
+ llama_batch batch = llama_batch_init(params.n_ctx, 0);
171
 
172
  int32_t n_total_prompt = 0;
173
  int32_t n_total_gen = 0;
 
339
  //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
340
  // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
341
 
342
+ const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
343
 
344
  if (client.n_decoded == 1) {
345
  // start measuring generation time after the first token to make sure all concurrent clients
 
384
 
385
  n_total_prompt += client.n_prompt;
386
  n_total_gen += client.n_decoded;
387
+
388
  client.seq_id = -1;
389
  }
390
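
The batch in the parallel example is allocated for a full context worth of tokens, per the comment in the hunk: several clients can queue long prompts at once, while the decode loop still submits at most params.n_batch tokens per call. A one-line sketch with the signature as used above (the second argument selects token-id rather than embedding input in this API vintage; treat that as an assumption):

// One shared batch sized to the whole context; 0 = token-id input.
llama_batch batch = llama_batch_init(params.n_ctx, 0);
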
 
examples/save-load-state/save-load-state.cpp CHANGED
@@ -8,10 +8,9 @@
8
 
9
  int main(int argc, char ** argv) {
10
  gpt_params params;
11
- llama_sampling_params & sparams = params.sampling_params;
12
  params.seed = 42;
13
  params.n_threads = 4;
14
- sparams.repeat_last_n = 64;
15
  params.prompt = "The quick brown fox";
16
 
17
  if (!gpt_params_parse(argc, argv, params)) {
@@ -25,7 +24,7 @@ int main(int argc, char ** argv) {
25
  }
26
 
27
  auto n_past = 0;
28
- auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
29
 
30
  // init
31
  llama_model * model;
 
8
 
9
  int main(int argc, char ** argv) {
10
  gpt_params params;
 
11
  params.seed = 42;
12
  params.n_threads = 4;
13
+ params.repeat_last_n = 64;
14
  params.prompt = "The quick brown fox";
15
 
16
  if (!gpt_params_parse(argc, argv, params)) {
 
24
  }
25
 
26
  auto n_past = 0;
27
+ auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
28
 
29
  // init
30
  llama_model * model;
examples/server/index.html.hpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/server/public/index.html CHANGED
@@ -136,11 +136,6 @@
136
  display: block;
137
  }
138
 
139
- fieldset label.slim {
140
- margin: 0 0.5em;
141
- display: inline;
142
- }
143
-
144
  header, footer {
145
  text-align: center;
146
  }
@@ -150,14 +145,6 @@
150
  color: #888;
151
  }
152
 
153
- .mode-chat textarea[name=prompt] {
154
- height: 4.5em;
155
- }
156
-
157
- .mode-completion textarea[name=prompt] {
158
- height: 10em;
159
- }
160
-
161
 
162
  @keyframes loading-bg-wipe {
163
  0% {
@@ -200,7 +187,7 @@
200
  template: "{{prompt}}\n\n{{history}}\n{{char}}:",
201
  historyTemplate: "{{name}}: {{message}}",
202
  transcript: [],
203
- type: "chat", // "chat" | "completion"
204
  char: "Llama",
205
  user: "User",
206
  })
@@ -378,44 +365,13 @@
378
  return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
379
  }
380
 
381
- async function runLlama(prompt, llamaParams, char) {
382
- const currentMessages = [];
383
- const history = session.value.transcript;
384
- if (controller.value) {
385
- throw new Error("already running");
386
- }
387
- controller.value = new AbortController();
388
- for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
389
- const data = chunk.data;
390
-
391
- if (data.stop) {
392
- while (
393
- currentMessages.length > 0 &&
394
- currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
395
- ) {
396
- currentMessages.pop();
397
- }
398
- transcriptUpdate([...history, [char, currentMessages]])
399
- console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
400
- } else {
401
- currentMessages.push(data);
402
- transcriptUpdate([...history, [char, currentMessages]])
403
- }
404
-
405
- if (data.timings) {
406
- llamaStats.value = data.timings;
407
- }
408
- }
409
-
410
- controller.value = null;
411
- }
412
-
413
  // send message to server
414
  const chat = async (msg) => {
415
  if (controller.value) {
416
  console.log('already running...');
417
  return;
418
  }
 
419
 
420
  transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
421
 
@@ -435,41 +391,55 @@
435
  ).join("\n"),
436
  });
437
 
438
- await runLlama(prompt, {
 
 
 
439
  ...params.value,
440
  stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
441
- }, "{{char}}");
442
- }
443
-
444
- const runCompletion = async () => {
445
- if (controller.value) {
446
- console.log('already running...');
447
- return;
448
  }
449
- const {prompt} = session.value;
450
- transcriptUpdate([...session.value.transcript, ["", prompt]]);
451
- await runLlama(prompt, {
452
- ...params.value,
453
- stop: [],
454
- }, "");
455
- }
456
 
457
- const stop = (e) => {
458
- e.preventDefault();
459
- if (controller.value) {
460
- controller.value.abort();
461
- controller.value = null;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
  }
463
- }
464
 
465
- const reset = (e) => {
466
- stop(e);
467
- transcriptUpdate([]);
468
  }
469
 
470
  function MessageInput() {
471
  const message = useSignal("")
472
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  const submit = (e) => {
474
  stop(e);
475
  chat(message.value);
@@ -504,19 +474,6 @@
504
  `
505
  }
506
 
507
- function CompletionControls() {
508
- const submit = (e) => {
509
- stop(e);
510
- runCompletion();
511
- }
512
- return html`
513
- <div>
514
- <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
515
- <button onclick=${stop} disabled=${!generating.value}>Stop</button>
516
- <button onclick=${reset}>Reset</button>
517
- </div>`;
518
- }
519
-
520
  const ChatLog = (props) => {
521
  const messages = session.value.transcript;
522
  const container = useRef(null)
@@ -540,11 +497,7 @@
540
  data;
541
  message = html`<${Markdownish} text=${template(text)} />`
542
  }
543
- if(user) {
544
- return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
545
- } else {
546
- return html`<p key=${index}>${message}</p>`
547
- }
548
  };
549
 
550
  return html`
@@ -621,31 +574,18 @@
621
  userTemplateAutosave()
622
  }, [session.value, params.value])
623
 
624
- const GrammarControl = () => (
625
- html`
626
- <div>
627
- <label for="template">Grammar</label>
628
- <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
629
- <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
630
- <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
631
- </div>
632
- `
633
- );
634
-
635
- const PromptControlFieldSet = () => (
636
- html`
637
- <fieldset>
638
- <div>
639
- <label htmlFor="prompt">Prompt</label>
640
- <textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
641
- </div>
642
- </fieldset>
643
- `
644
- );
645
 
646
- const ChatConfigForm = () => (
647
- html`
648
- ${PromptControlFieldSet()}
 
 
 
649
 
650
  <fieldset class="two">
651
  <div>
@@ -669,30 +609,15 @@
669
  <label for="template">Chat history template</label>
670
  <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
671
  </div>
672
- ${GrammarControl()}
673
- </fieldset>
674
- `
675
- );
676
-
677
- const CompletionConfigForm = () => (
678
- html`
679
- ${PromptControlFieldSet()}
680
- <fieldset>${GrammarControl()}</fieldset>
681
- `
682
- );
683
 
684
- return html`
685
- <form>
686
- <fieldset class="two">
687
- <${UserTemplateResetButton}/>
688
  <div>
689
- <label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
690
- <label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
 
 
691
  </div>
692
  </fieldset>
693
 
694
- ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
695
-
696
  <fieldset class="two">
697
  ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
698
  ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
@@ -926,7 +851,7 @@
926
  function App(props) {
927
 
928
  return html`
929
- <div class="mode-${session.value.type}">
930
  <header>
931
  <h1>llama.cpp</h1>
932
  </header>
@@ -936,7 +861,7 @@
936
  </main>
937
 
938
  <section id="write">
939
- <${session.value.type === 'chat' ? MessageInput : CompletionControls} />
940
  </section>
941
 
942
  <footer>
 
136
  display: block;
137
  }
138
 
 
 
 
 
 
139
  header, footer {
140
  text-align: center;
141
  }
 
145
  color: #888;
146
  }
147
 
 
 
 
 
 
 
 
 
148
 
149
  @keyframes loading-bg-wipe {
150
  0% {
 
187
  template: "{{prompt}}\n\n{{history}}\n{{char}}:",
188
  historyTemplate: "{{name}}: {{message}}",
189
  transcript: [],
190
+ type: "chat",
191
  char: "Llama",
192
  user: "User",
193
  })
 
365
  return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
366
  }
367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  // send message to server
369
  const chat = async (msg) => {
370
  if (controller.value) {
371
  console.log('already running...');
372
  return;
373
  }
374
+ controller.value = new AbortController();
375
 
376
  transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
377
 
 
391
  ).join("\n"),
392
  });
393
 
394
+ const currentMessages = [];
395
+ const history = session.value.transcript
396
+
397
+ const llamaParams = {
398
  ...params.value,
399
  stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
 
 
 
 
 
 
 
400
  }
 
 
 
 
 
 
 
401
 
402
+ for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
403
+ const data = chunk.data;
404
+
405
+ if (data.stop) {
406
+ while (
407
+ currentMessages.length > 0 &&
408
+ currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
409
+ ) {
410
+ currentMessages.pop();
411
+ }
412
+ transcriptUpdate([...history, ["{{char}}", currentMessages]])
413
+ console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
414
+ } else {
415
+ currentMessages.push(data);
416
+ transcriptUpdate([...history, ["{{char}}", currentMessages]])
417
+ }
418
+
419
+ if (data.timings) {
420
+ llamaStats.value = data.timings;
421
+ }
422
  }
 
423
 
424
+ controller.value = null;
 
 
425
  }
426
 
427
  function MessageInput() {
428
  const message = useSignal("")
429
 
430
+ const stop = (e) => {
431
+ e.preventDefault();
432
+ if (controller.value) {
433
+ controller.value.abort();
434
+ controller.value = null;
435
+ }
436
+ }
437
+
438
+ const reset = (e) => {
439
+ stop(e);
440
+ transcriptUpdate([]);
441
+ }
442
+
443
  const submit = (e) => {
444
  stop(e);
445
  chat(message.value);
 
474
  `
475
  }
476
 
 
 
 
 
 
 
 
 
 
 
 
 
 
477
  const ChatLog = (props) => {
478
  const messages = session.value.transcript;
479
  const container = useRef(null)
 
497
  data;
498
  message = html`<${Markdownish} text=${template(text)} />`
499
  }
500
+ return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
 
 
 
 
501
  };
502
 
503
  return html`
 
574
  userTemplateAutosave()
575
  }, [session.value, params.value])
576
 
577
+ return html`
578
+ <form>
579
+ <fieldset>
580
+ <${UserTemplateResetButton}/>
581
+ </fieldset>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
582
 
583
+ <fieldset>
584
+ <div>
585
+ <label for="prompt">Prompt</label>
586
+ <textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
587
+ </div>
588
+ </fieldset>
589
 
590
  <fieldset class="two">
591
  <div>
 
609
  <label for="template">Chat history template</label>
610
  <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
611
  </div>
 
 
 
 
 
 
 
 
 
 
 
612
 
 
 
 
 
613
  <div>
614
+ <label for="template">Grammar</label>
615
+ <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
616
+ <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
617
+ <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
618
  </div>
619
  </fieldset>
620
 
 
 
621
  <fieldset class="two">
622
  ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
623
  ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
 
851
  function App(props) {
852
 
853
  return html`
854
+ <div>
855
  <header>
856
  <h1>llama.cpp</h1>
857
  </header>
 
861
  </main>
862
 
863
  <section id="write">
864
+ <${MessageInput} />
865
  </section>
866
 
867
  <footer>
examples/server/server.cpp CHANGED
@@ -200,7 +200,6 @@ struct llama_server_context
200
  llama_model *model = nullptr;
201
  llama_context *ctx = nullptr;
202
  gpt_params params;
203
- llama_sampling_context ctx_sampling;
204
  int n_ctx;
205
 
206
  grammar_parser::parse_state parsed_grammar;
@@ -255,7 +254,6 @@ struct llama_server_context
255
  if (grammar != nullptr) {
256
  llama_grammar_free(grammar);
257
  grammar = nullptr;
258
- ctx_sampling = llama_sampling_context_init(params, NULL);
259
  }
260
  }
261
 
@@ -331,8 +329,8 @@ struct llama_server_context
331
  grammar_parser::print_grammar(stderr, parsed_grammar);
332
 
333
  {
334
- auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx));
335
- if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) {
336
  LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
337
  }
338
  }
@@ -341,26 +339,14 @@ struct llama_server_context
341
  grammar = llama_grammar_init(
342
  grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
343
  }
344
- ctx_sampling = llama_sampling_context_init(params, grammar);
345
  return true;
346
  }
347
 
348
  void loadInfill()
349
  {
350
- bool suff_rm_leading_spc = true;
351
- if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
352
- params.input_suffix.erase(0, 1);
353
- suff_rm_leading_spc = false;
354
- }
355
-
356
- auto prefix_tokens = tokenize(params.input_prefix, false);
357
- auto suffix_tokens = tokenize(params.input_suffix, false);
358
- const int space_token = 29871;
359
- if (suff_rm_leading_spc && suffix_tokens[0] == space_token) {
360
- suffix_tokens.erase(suffix_tokens.begin());
361
- }
362
  prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
363
- prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
364
  prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
365
  prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
366
  prefix_tokens.push_back(llama_token_middle(ctx));
@@ -405,7 +391,6 @@ struct llama_server_context
405
  // compare the evaluated prompt with the new prompt
406
  n_past = common_part(embd, prompt_tokens);
407
  embd = prompt_tokens;
408
-
409
  if (n_past == num_prompt_tokens)
410
  {
411
  // we have to evaluate at least 1 token to generate logits.
@@ -413,9 +398,6 @@ struct llama_server_context
413
  n_past--;
414
  }
415
 
416
- // since #3228 we now have to manually manage the KV cache
417
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
418
-
419
  LOG_VERBOSE("prompt ingested", {
420
  {"n_past", n_past},
421
  {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -465,6 +447,9 @@ struct llama_server_context
465
  // compare the evaluated prompt with the new prompt
466
  n_past = common_part(embd, prompt_tokens);
467
 
 
 
 
468
  embd = prompt_tokens;
469
  if (n_past == num_prompt_tokens)
470
  {
@@ -472,9 +457,6 @@ struct llama_server_context
472
  n_past--;
473
  }
474
 
475
- // since #3228 we now have to manually manage the KV cache
476
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
477
-
478
  LOG_VERBOSE("prompt ingested", {
479
  {"n_past", n_past},
480
  {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -557,12 +539,12 @@ struct llama_server_context
557
  std::vector<llama_token_data> candidates;
558
  candidates.reserve(llama_n_vocab(model));
559
 
560
- result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates);
561
 
562
  llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
563
 
564
- const int32_t n_probs = params.sampling_params.n_probs;
565
- if (params.sampling_params.temp <= 0 && n_probs > 0)
566
  {
567
  // For llama_sample_token_greedy we need to sort candidates
568
  llama_sample_softmax(ctx, &candidates_p);
@@ -637,7 +619,7 @@ struct llama_server_context
637
  const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
638
  generated_text += token_text;
639
 
640
- if (params.sampling_params.n_probs > 0)
641
  {
642
  generated_token_probs.push_back(token_with_probs);
643
  }
@@ -718,16 +700,15 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
718
  printf("usage: %s [options]\n", argv0);
719
  printf("\n");
720
  printf("options:\n");
721
- printf(" -h, --help show this help message and exit\n");
722
- printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
723
- printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
724
- printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
725
- printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
726
- printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
727
- printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
728
- printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
729
- printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
730
- printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
731
  if (llama_mlock_supported())
732
  {
733
  printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
@@ -872,15 +853,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
872
  }
873
  params.n_threads = std::stoi(argv[i]);
874
  }
875
- else if (arg == "--threads-batch" || arg == "-tb")
876
- {
877
- if (++i >= argc)
878
- {
879
- invalid_param = true;
880
- break;
881
- }
882
- params.n_threads_batch = std::stoi(argv[i]);
883
- }
884
  else if (arg == "-b" || arg == "--batch-size")
885
  {
886
  if (++i >= argc)
@@ -1035,35 +1007,34 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
1035
 
1036
  static json format_generation_settings(llama_server_context &llama)
1037
  {
1038
- const auto & sparams = llama.params.sampling_params;
1039
- const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
1040
- const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
1041
  eos_bias->second < 0.0f && std::isinf(eos_bias->second);
1042
 
1043
  return json{
1044
  {"n_ctx", llama.n_ctx},
1045
  {"model", llama.params.model_alias},
1046
  {"seed", llama.params.seed},
1047
- {"temp", sparams.temp},
1048
- {"top_k", sparams.top_k},
1049
- {"top_p", sparams.top_p},
1050
- {"tfs_z", sparams.tfs_z},
1051
- {"typical_p", sparams.typical_p},
1052
- {"repeat_last_n", sparams.repeat_last_n},
1053
- {"repeat_penalty", sparams.repeat_penalty},
1054
- {"presence_penalty", sparams.presence_penalty},
1055
- {"frequency_penalty", sparams.frequency_penalty},
1056
- {"mirostat", sparams.mirostat},
1057
- {"mirostat_tau", sparams.mirostat_tau},
1058
- {"mirostat_eta", sparams.mirostat_eta},
1059
- {"penalize_nl", sparams.penalize_nl},
1060
  {"stop", llama.params.antiprompt},
1061
  {"n_predict", llama.params.n_predict},
1062
  {"n_keep", llama.params.n_keep},
1063
  {"ignore_eos", ignore_eos},
1064
  {"stream", llama.stream},
1065
- {"logit_bias", sparams.logit_bias},
1066
- {"n_probs", sparams.n_probs},
1067
  {"grammar", llama.params.grammar},
1068
  };
1069
  }
@@ -1112,7 +1083,7 @@ static json format_final_response(llama_server_context &llama, const std::string
1112
  {"timings", format_timings(llama)},
1113
  };
1114
 
1115
- if (llama.params.sampling_params.n_probs > 0)
1116
  {
1117
  res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
1118
  }
@@ -1128,7 +1099,7 @@ static json format_partial_response(
1128
  {"stop", false},
1129
  };
1130
 
1131
- if (llama.params.sampling_params.n_probs > 0)
1132
  {
1133
  res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
1134
  }
@@ -1160,28 +1131,26 @@ static T json_value(const json &body, const std::string &key, const T &default_v
1160
  static void parse_options_completion(const json &body, llama_server_context &llama)
1161
  {
1162
  gpt_params default_params;
1163
- const auto & default_sparams = default_params.sampling_params;
1164
- auto & sparams = llama.params.sampling_params;
1165
 
1166
  llama.stream = json_value(body, "stream", false);
1167
  llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
1168
- sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
1169
- sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
1170
- sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
1171
- sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p);
1172
- sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n);
1173
- sparams.temp = json_value(body, "temperature", default_sparams.temp);
1174
- sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty);
1175
- sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty);
1176
- sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty);
1177
- sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat);
1178
- sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
1179
- sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
1180
- sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
1181
  llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
1182
  llama.params.seed = json_value(body, "seed", default_params.seed);
1183
  llama.params.grammar = json_value(body, "grammar", default_params.grammar);
1184
- sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);
1185
 
1186
  if (body.count("prompt") != 0)
1187
  {
@@ -1192,10 +1161,10 @@ static void parse_options_completion(const json &body, llama_server_context &lla
1192
  llama.prompt = "";
1193
  }
1194
 
1195
- sparams.logit_bias.clear();
1196
  if (json_value(body, "ignore_eos", false))
1197
  {
1198
- sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
1199
  }
1200
 
1201
  const auto &logit_bias = body.find("logit_bias");
@@ -1211,11 +1180,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla
1211
  {
1212
  if (el[1].is_number())
1213
  {
1214
- sparams.logit_bias[tok] = el[1].get<float>();
1215
  }
1216
  else if (el[1].is_boolean() && !el[1].get<bool>())
1217
  {
1218
- sparams.logit_bias[tok] = -INFINITY;
1219
  }
1220
  }
1221
  }
@@ -1235,8 +1204,6 @@ static void parse_options_completion(const json &body, llama_server_context &lla
1235
  }
1236
  }
1237
 
1238
- llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar);
1239
-
1240
  LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
1241
  }
1242
 
@@ -1445,7 +1412,7 @@ int main(int argc, char **argv)
1445
  }
1446
 
1447
  auto probs = llama.generated_token_probs;
1448
- if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
1449
  const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
1450
  probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
1451
  }
@@ -1497,7 +1464,7 @@ int main(int argc, char **argv)
1497
 
1498
  std::vector<completion_token_output> probs_output = {};
1499
 
1500
- if (llama.params.sampling_params.n_probs > 0) {
1501
  const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
1502
  size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
1503
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
@@ -1618,7 +1585,7 @@ int main(int argc, char **argv)
1618
 
1619
  std::vector<completion_token_output> probs_output = {};
1620
 
1621
- if (llama.params.sampling_params.n_probs > 0) {
1622
  const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
1623
  size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
1624
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
 
200
  llama_model *model = nullptr;
201
  llama_context *ctx = nullptr;
202
  gpt_params params;
 
203
  int n_ctx;
204
 
205
  grammar_parser::parse_state parsed_grammar;
 
254
  if (grammar != nullptr) {
255
  llama_grammar_free(grammar);
256
  grammar = nullptr;
 
257
  }
258
  }
259
 
 
329
  grammar_parser::print_grammar(stderr, parsed_grammar);
330
 
331
  {
332
+ auto it = params.logit_bias.find(llama_token_eos(ctx));
333
+ if (it != params.logit_bias.end() && it->second == -INFINITY) {
334
  LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
335
  }
336
  }
 
339
  grammar = llama_grammar_init(
340
  grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
341
  }
 
342
  return true;
343
  }
344
 
345
  void loadInfill()
346
  {
347
+ auto prefix_tokens = tokenize(params.input_prefix, true); // always add BOS
348
+ auto suffix_tokens = tokenize(params.input_suffix, true); // always add BOS
 
 
 
 
 
 
 
 
 
 
349
  prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
 
350
  prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
351
  prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
352
  prefix_tokens.push_back(llama_token_middle(ctx));
 
391
  // compare the evaluated prompt with the new prompt
392
  n_past = common_part(embd, prompt_tokens);
393
  embd = prompt_tokens;
 
394
  if (n_past == num_prompt_tokens)
395
  {
396
  // we have to evaluate at least 1 token to generate logits.
 
398
  n_past--;
399
  }
400
 
 
 
 
401
  LOG_VERBOSE("prompt ingested", {
402
  {"n_past", n_past},
403
  {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
 
447
  // compare the evaluated prompt with the new prompt
448
  n_past = common_part(embd, prompt_tokens);
449
 
450
+ // since #3228 we now have to manually manage the KV cache
451
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
452
+
453
  embd = prompt_tokens;
454
  if (n_past == num_prompt_tokens)
455
  {
 
457
  n_past--;
458
  }
459
 
 
 
 
460
  LOG_VERBOSE("prompt ingested", {
461
  {"n_past", n_past},
462
  {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
 
539
  std::vector<llama_token_data> candidates;
540
  candidates.reserve(llama_n_vocab(model));
541
 
542
+ result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
543
 
544
  llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
545
 
546
+ const int32_t n_probs = params.n_probs;
547
+ if (params.temp <= 0 && n_probs > 0)
548
  {
549
  // For llama_sample_token_greedy we need to sort candidates
550
  llama_sample_softmax(ctx, &candidates_p);
 
619
  const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
620
  generated_text += token_text;
621
 
622
+ if (params.n_probs > 0)
623
  {
624
  generated_token_probs.push_back(token_with_probs);
625
  }
 
700
  printf("usage: %s [options]\n", argv0);
701
  printf("\n");
702
  printf("options:\n");
703
+ printf(" -h, --help show this help message and exit\n");
704
+ printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
705
+ printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
706
+ printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
707
+ printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
708
+ printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
709
+ printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
710
+ printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
711
+ printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
 
712
  if (llama_mlock_supported())
713
  {
714
  printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
 
853
  }
854
  params.n_threads = std::stoi(argv[i]);
855
  }
 
 
 
 
 
 
 
 
 
856
  else if (arg == "-b" || arg == "--batch-size")
857
  {
858
  if (++i >= argc)
 
1007
 
1008
  static json format_generation_settings(llama_server_context &llama)
1009
  {
1010
+ const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
1011
+ const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
 
1012
  eos_bias->second < 0.0f && std::isinf(eos_bias->second);
1013
 
1014
  return json{
1015
  {"n_ctx", llama.n_ctx},
1016
  {"model", llama.params.model_alias},
1017
  {"seed", llama.params.seed},
1018
+ {"temp", llama.params.temp},
1019
+ {"top_k", llama.params.top_k},
1020
+ {"top_p", llama.params.top_p},
1021
+ {"tfs_z", llama.params.tfs_z},
1022
+ {"typical_p", llama.params.typical_p},
1023
+ {"repeat_last_n", llama.params.repeat_last_n},
1024
+ {"repeat_penalty", llama.params.repeat_penalty},
1025
+ {"presence_penalty", llama.params.presence_penalty},
1026
+ {"frequency_penalty", llama.params.frequency_penalty},
1027
+ {"mirostat", llama.params.mirostat},
1028
+ {"mirostat_tau", llama.params.mirostat_tau},
1029
+ {"mirostat_eta", llama.params.mirostat_eta},
1030
+ {"penalize_nl", llama.params.penalize_nl},
1031
  {"stop", llama.params.antiprompt},
1032
  {"n_predict", llama.params.n_predict},
1033
  {"n_keep", llama.params.n_keep},
1034
  {"ignore_eos", ignore_eos},
1035
  {"stream", llama.stream},
1036
+ {"logit_bias", llama.params.logit_bias},
1037
+ {"n_probs", llama.params.n_probs},
1038
  {"grammar", llama.params.grammar},
1039
  };
1040
  }
 
1083
  {"timings", format_timings(llama)},
1084
  };
1085
 
1086
+ if (llama.params.n_probs > 0)
1087
  {
1088
  res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
1089
  }
 
1099
  {"stop", false},
1100
  };
1101
 
1102
+ if (llama.params.n_probs > 0)
1103
  {
1104
  res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
1105
  }
 
1131
  static void parse_options_completion(const json &body, llama_server_context &llama)
1132
  {
1133
  gpt_params default_params;
 
 
1134
 
1135
  llama.stream = json_value(body, "stream", false);
1136
  llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
1137
+ llama.params.top_k = json_value(body, "top_k", default_params.top_k);
1138
+ llama.params.top_p = json_value(body, "top_p", default_params.top_p);
1139
+ llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
1140
+ llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
1141
+ llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
1142
+ llama.params.temp = json_value(body, "temperature", default_params.temp);
1143
+ llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
1144
+ llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
1145
+ llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
1146
+ llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
1147
+ llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
1148
+ llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
1149
+ llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
1150
  llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
1151
  llama.params.seed = json_value(body, "seed", default_params.seed);
1152
  llama.params.grammar = json_value(body, "grammar", default_params.grammar);
1153
+ llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
1154
 
1155
  if (body.count("prompt") != 0)
1156
  {
 
1161
  llama.prompt = "";
1162
  }
1163
 
1164
+ llama.params.logit_bias.clear();
1165
  if (json_value(body, "ignore_eos", false))
1166
  {
1167
+ llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
1168
  }
1169
 
1170
  const auto &logit_bias = body.find("logit_bias");
 
1180
  {
1181
  if (el[1].is_number())
1182
  {
1183
+ llama.params.logit_bias[tok] = el[1].get<float>();
1184
  }
1185
  else if (el[1].is_boolean() && !el[1].get<bool>())
1186
  {
1187
+ llama.params.logit_bias[tok] = -INFINITY;
1188
  }
1189
  }
1190
  }
 
1204
  }
1205
  }
1206
 
 
 
1207
  LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
1208
  }
1209
 
 
1412
  }
1413
 
1414
  auto probs = llama.generated_token_probs;
1415
+ if (llama.params.n_probs > 0 && llama.stopped_word) {
1416
  const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
1417
  probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
1418
  }
 
1464
 
1465
  std::vector<completion_token_output> probs_output = {};
1466
 
1467
+ if (llama.params.n_probs > 0) {
1468
  const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
1469
  size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
1470
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
 
1585
 
1586
  std::vector<completion_token_output> probs_output = {};
1587
 
1588
+ if (llama.params.n_probs > 0) {
1589
  const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
1590
  size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
1591
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
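
The server-side picture after this hunk: request fields are written directly into llama.params, with a default-constructed gpt_params supplying fallbacks, and ignore_eos is implemented purely as a logit bias. A condensed sketch built only from names that appear in the hunk above:

// Pull sampling options from the JSON body, defaulting to gpt_params values.
gpt_params default_params;
llama.params.temp  = json_value(body, "temperature", default_params.temp);
llama.params.top_k = json_value(body, "top_k",       default_params.top_k);
llama.params.top_p = json_value(body, "top_p",       default_params.top_p);

// "ignore_eos" is just a -INFINITY bias on the EOS token, which is also why
// it triggers the "most grammars will fail" warning above.
llama.params.logit_bias.clear();
if (json_value(body, "ignore_eos", false)) {
    llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
}
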
examples/speculative/speculative.cpp CHANGED
@@ -125,8 +125,6 @@ int main(int argc, char ** argv) {
125
  grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
126
  }
127
 
128
- llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt);
129
-
130
  const auto t_dec_start = ggml_time_us();
131
 
132
  while (true) {
@@ -136,7 +134,7 @@ int main(int argc, char ** argv) {
136
 
137
  while (true) {
138
  // sample from the target model
139
- llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft);
140
 
141
  // remember which tokens were sampled - used for repetition penalties during sampling
142
  last_tokens.erase(last_tokens.begin());
@@ -213,13 +211,7 @@ int main(int argc, char ** argv) {
213
  if (grammar_dft) {
214
  llama_grammar_free(grammar_dft);
215
  }
216
- // Note: Hardcoded to sequence id 0, if this ever supports parallel generation
217
- // that will need to change.
218
- auto it = ctx_sampling.sequence_contexts.find(0);
219
- GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
220
- // This is necessary because each sequence id in sequence_contexts
221
- // uses a copy of the original grammar.
222
- grammar_dft = llama_grammar_copy(it->second.grammar);
223
 
224
  LOG("copied target grammar to draft grammar\n");
225
  }
 
125
  grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
126
  }
127
 
 
 
128
  const auto t_dec_start = ggml_time_us();
129
 
130
  while (true) {
 
134
 
135
  while (true) {
136
  // sample from the target model
137
+ llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
138
 
139
  // remember which tokens were sampled - used for repetition penalties during sampling
140
  last_tokens.erase(last_tokens.begin());
 
211
  if (grammar_dft) {
212
  llama_grammar_free(grammar_dft);
213
  }
214
+ grammar_dft = llama_grammar_copy(grammar_tgt);
 
 
 
 
 
 
215
 
216
  LOG("copied target grammar to draft grammar\n");
217
  }
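
A note on the speculative example above: the change reverts to sampling with llama_sample_token() against the target grammar, and when the draft model has to be re-synced its stale grammar is freed and replaced with a fresh llama_grammar_copy() of the target one. The sketch below shows only that free-then-clone ownership flow, with a stand-in Grammar type instead of llama.cpp's opaque llama_grammar pointers.

    // Illustrative only: the "free the draft copy, re-clone from the target" pattern above.
    #include <string>
    #include <iostream>

    struct Grammar {
        std::string rules;   // placeholder for a parsed rule set
    };

    static Grammar * grammar_copy(const Grammar * src) { return new Grammar(*src); }
    static void      grammar_free(Grammar * g)         { delete g; }

    int main() {
        Grammar * target = new Grammar{"root ::= \"yes\" | \"no\""};
        Grammar * draft  = nullptr;

        // each time the target accepts tokens, the draft grammar must match the
        // target grammar's state again: drop the stale copy and clone a fresh one
        for (int step = 0; step < 3; ++step) {
            if (draft) {
                grammar_free(draft);
            }
            draft = grammar_copy(target);
            std::cout << "step " << step << ": draft grammar re-synced (" << draft->rules << ")\n";
        }

        grammar_free(draft);
        grammar_free(target);
    }

Cloning rather than sharing lets the draft advance its own grammar state ahead of the target without corrupting the target's copy.
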
ggml-alloc.c CHANGED
@@ -1,5 +1,4 @@
1
  #include "ggml-alloc.h"
2
- #include "ggml-backend.h"
3
  #include "ggml.h"
4
  #include <assert.h>
5
  #include <stdarg.h>
@@ -7,6 +6,25 @@
7
  #include <stdlib.h>
8
  #include <string.h>
9
 
10
 
11
  #define UNUSED(x) (void)(x)
12
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -62,9 +80,8 @@ struct free_block {
62
  #define MAX_FREE_BLOCKS 256
63
 
64
  struct ggml_allocr {
65
- struct ggml_backend_buffer * buffer;
66
- bool buffer_owned;
67
  void * data;
 
68
  size_t alignment;
69
  int n_free_blocks;
70
  struct free_block free_blocks[MAX_FREE_BLOCKS];
@@ -102,9 +119,16 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
102
  }
103
  #endif
104
 
 
 
 
 
 
 
105
  // check if a tensor is allocated by this buffer
106
  static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
107
- return tensor->buffer == alloc->buffer;
 
108
  }
109
 
110
  static bool ggml_is_view(struct ggml_tensor * t) {
@@ -112,10 +136,11 @@ static bool ggml_is_view(struct ggml_tensor * t) {
112
  }
113
 
114
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 
115
  GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
116
  GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
117
-
118
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
119
  size = aligned_offset(NULL, size, alloc->alignment);
120
 
121
  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -163,8 +188,6 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
163
 
164
  tensor->data = addr;
165
  AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
166
- tensor->buffer = alloc->buffer;
167
- ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
168
 
169
  #ifdef GGML_ALLOCATOR_DEBUG
170
  add_allocated_tensor(alloc, tensor);
@@ -185,21 +208,19 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
185
 
186
  // this is a very naive implementation, but for our case the number of free blocks should be very small
187
  static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 
 
188
  if (ggml_allocr_is_own(alloc, tensor) == false) {
189
  // the tensor was not allocated in this buffer
190
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
191
  // the easiest way to deal with this is just to ignore it
192
- AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
193
  return;
194
  }
195
 
196
- void * ptr = tensor->data;
197
-
198
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
199
  size = aligned_offset(NULL, size, alloc->alignment);
200
  AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
201
-
202
- ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
203
 
204
  #ifdef GGML_ALLOCATOR_DEBUG
205
  remove_allocated_tensor(alloc, tensor);
@@ -264,18 +285,15 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
264
  alloc->n_free_blocks = 1;
265
  size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
266
  alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
267
- alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
268
  }
269
 
270
  struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
271
- struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
272
-
273
- struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
274
 
275
  *alloc = (struct ggml_allocr){
276
- /*.buffer = */ buffer,
277
- /*.buffer_owned = */ true,
278
- /*.base = */ ggml_backend_buffer_get_base(buffer),
279
  /*.alignment = */ alignment,
280
  /*.n_free_blocks = */ 0,
281
  /*.free_blocks = */ {{0}},
@@ -294,26 +312,74 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
294
  return alloc;
295
  }
296
 
297
- struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
298
- struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
299
- alloc->measure = true;
300
 
301
- return alloc;
  }
303
 
304
- struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
305
- struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
306
 
307
  *alloc = (struct ggml_allocr){
308
- /*.buffer = */ buffer,
309
- /*.buffer_owned = */ false,
310
- /*.base = */ ggml_backend_buffer_get_base(buffer),
311
- /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
312
  /*.n_free_blocks = */ 0,
313
  /*.free_blocks = */ {{0}},
314
  /*.hash_table = */ {{0}},
315
  /*.max_size = */ 0,
316
- /*.measure = */ false,
317
  /*.parse_seq = */ {0},
318
  /*.parse_seq_len = */ 0,
319
  #ifdef GGML_ALLOCATOR_DEBUG
@@ -327,8 +393,8 @@ struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * bu
327
  }
328
 
329
  void ggml_allocr_free(struct ggml_allocr * alloc) {
330
- if (alloc->buffer_owned) {
331
- ggml_backend_buffer_free(alloc->buffer);
332
  }
333
  free(alloc);
334
  }
@@ -371,6 +437,7 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
371
  case GGML_OP_ROPE:
372
  case GGML_OP_RMS_NORM:
373
  case GGML_OP_SOFT_MAX:
 
374
  return true;
375
 
376
  default:
@@ -378,23 +445,12 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
378
  }
379
  }
380
 
381
- static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
382
- assert(view->view_src != NULL && view->view_src->data != NULL);
383
- view->backend = view->view_src->backend;
384
- view->buffer = view->view_src->buffer;
385
- view->data = (char *)view->view_src->data + view->view_offs;
386
-
387
- // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
388
- // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
389
- assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
390
- ggml_backend_buffer_init_tensor(alloc->buffer, view);
391
- }
392
-
393
  static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
394
  struct hash_node * ht = alloc->hash_table;
395
  if (node->data == NULL) {
396
  if (ggml_is_view(node)) {
397
- init_view(alloc, node);
 
398
  } else {
399
  // see if we can reuse a parent's buffer (inplace)
400
  if (ggml_op_can_inplace(node->op)) {
@@ -422,17 +478,13 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
422
  // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
423
  // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
424
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
425
- node->view_src = view_src;
426
- view_src_hn->n_views += 1;
427
- init_view(alloc, node);
428
  return;
429
  }
430
  }
431
  else {
432
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
433
- node->view_src = parent;
434
- p_hn->n_views += 1;
435
- init_view(alloc, node);
436
  return;
437
  }
438
  }
@@ -443,7 +495,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
443
  }
444
  }
445
 
446
- size_t ggml_allocr_alloc_graph_n(
447
  struct ggml_allocr * alloc,
448
  struct ggml_cgraph ** graphs, int n_graphs,
449
  struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -461,10 +513,6 @@ size_t ggml_allocr_alloc_graph_n(
461
  if (ggml_is_view(node)) {
462
  struct ggml_tensor * view_src = node->view_src;
463
  hash_get(ht, view_src)->n_views += 1;
464
- if (node->buffer == NULL && node->data != NULL) {
465
- // view of a pre-allocated tensor, didn't call init_view() yet
466
- init_view(alloc, node);
467
- }
468
  }
469
 
470
  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -473,9 +521,6 @@ size_t ggml_allocr_alloc_graph_n(
473
  break;
474
  }
475
  hash_get(ht, parent)->n_children += 1;
476
- if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
477
- init_view(alloc, parent);
478
- }
479
  }
480
  }
481
  }
@@ -586,7 +631,7 @@ size_t ggml_allocr_alloc_graph_n(
586
  }
587
 
588
  size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
589
- return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
590
  }
591
 
592
  size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
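
A note on the allocator hunks above: with the ggml-backend buffer plumbing removed, ggml_allocr_is_own() decides ownership by testing whether a tensor's data pointer falls inside [alloc->data, alloc->data + alloc->max_size) rather than comparing buffer handles. A minimal sketch of that pointer-range test, with names local to the sketch:

    // Illustrative only: the pointer-range ownership check used by the reverted allocator.
    #include <cstdio>
    #include <cstddef>

    struct pool {
        void * data;       // start of the managed region
        size_t max_size;   // extent handed out so far
    };

    static bool pool_owns(const pool & p, const void * ptr) {
        const char * base = static_cast<const char *>(p.data);
        const char * c    = static_cast<const char *>(ptr);
        // comparing pointers from unrelated objects is what the original relies on too;
        // it behaves as expected on the flat address spaces ggml targets
        return c >= base && c < base + p.max_size;
    }

    int main() {
        char backing[256];
        pool p = { backing, sizeof(backing) };
        int  elsewhere = 0;   // an object that does not live in the pool

        std::printf("inside : %d\n", pool_owns(p, backing + 200));  // prints 1
        std::printf("outside: %d\n", pool_owns(p, &elsewhere));     // prints 0
    }
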
 
1
  #include "ggml-alloc.h"
 
2
  #include "ggml.h"
3
  #include <assert.h>
4
  #include <stdarg.h>
 
6
  #include <stdlib.h>
7
  #include <string.h>
8
 
9
+ #ifdef __has_include
10
+ #if __has_include(<unistd.h>)
11
+ #include <unistd.h>
12
+ #if defined(_POSIX_MAPPED_FILES)
13
+ #include <sys/types.h>
14
+ #include <sys/mman.h>
15
+ #endif
16
+ #endif
17
+ #endif
18
+
19
+ #if defined(_WIN32)
20
+ #define WIN32_LEAN_AND_MEAN
21
+ #ifndef NOMINMAX
22
+ #define NOMINMAX
23
+ #endif
24
+ #include <windows.h>
25
+ #include <memoryapi.h>
26
+ #endif
27
+
28
 
29
  #define UNUSED(x) (void)(x)
30
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
80
  #define MAX_FREE_BLOCKS 256
81
 
82
  struct ggml_allocr {
 
 
83
  void * data;
84
+ size_t size;
85
  size_t alignment;
86
  int n_free_blocks;
87
  struct free_block free_blocks[MAX_FREE_BLOCKS];
 
119
  }
120
  #endif
121
 
122
+ static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
123
+ return ggml_nbytes(tensor);
124
+
125
+ UNUSED(alloc);
126
+ }
127
+
128
  // check if a tensor is allocated by this buffer
129
  static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
130
+ void * ptr = tensor->data;
131
+ return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
132
  }
133
 
134
  static bool ggml_is_view(struct ggml_tensor * t) {
 
136
  }
137
 
138
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
139
+ #ifdef GGML_ALLOCATOR_DEBUG
140
  GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
141
  GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
142
+ #endif
143
+ size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
144
  size = aligned_offset(NULL, size, alloc->alignment);
145
 
146
  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 
188
 
189
  tensor->data = addr;
190
  AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
 
 
191
 
192
  #ifdef GGML_ALLOCATOR_DEBUG
193
  add_allocated_tensor(alloc, tensor);
 
208
 
209
  // this is a very naive implementation, but for our case the number of free blocks should be very small
210
  static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
211
+ void * ptr = tensor->data;
212
+
213
  if (ggml_allocr_is_own(alloc, tensor) == false) {
214
  // the tensor was not allocated in this buffer
215
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
216
  // the easiest way to deal with this is just to ignore it
 
217
  return;
218
  }
219
 
220
+ size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
 
 
221
  size = aligned_offset(NULL, size, alloc->alignment);
222
  AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
223
+ AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
 
224
 
225
  #ifdef GGML_ALLOCATOR_DEBUG
226
  remove_allocated_tensor(alloc, tensor);
 
285
  alloc->n_free_blocks = 1;
286
  size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
287
  alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
288
+ alloc->free_blocks[0].size = alloc->size - align_offset;
289
  }
290
 
291
  struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
292
+ struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
 
293
 
294
  *alloc = (struct ggml_allocr){
295
+ /*.data = */ data,
296
+ /*.size = */ size,
 
297
  /*.alignment = */ alignment,
298
  /*.n_free_blocks = */ 0,
299
  /*.free_blocks = */ {{0}},
 
312
  return alloc;
313
  }
314
 
315
+ // OS specific functions to allocate and free uncommitted virtual memory
316
+ static void * alloc_vmem(size_t size) {
317
+ #if defined(_WIN32)
318
+ return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
319
+ #elif defined(_POSIX_MAPPED_FILES)
320
+ void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
321
+ if (ptr == MAP_FAILED) {
322
+ return NULL;
323
+ }
324
+ return ptr;
325
+ #else
326
+ // use a fixed address for other platforms
327
+ uintptr_t base_addr = (uintptr_t)-size - 0x100;
328
+ return (void *)base_addr;
329
+ #endif
330
+ }
331
 
332
+ static void free_vmem(void * base_addr, size_t size) {
333
+ #if defined(_WIN32)
334
+ VirtualFree(base_addr, 0, MEM_RELEASE);
335
+ UNUSED(size);
336
+ #elif defined(_POSIX_MAPPED_FILES)
337
+ munmap(base_addr, size);
338
+ #else
339
+ // nothing to do
340
+ UNUSED(base_addr);
341
+ UNUSED(size);
342
+ #endif
343
  }
344
 
345
+ // allocate uncommitted virtual memory to measure the size of the graph
346
+ static void alloc_measure_vmem(void ** base_addr, size_t * size) {
347
+ // 128GB for 64-bit, 1GB for 32-bit
348
+ *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
349
+ do {
350
+ *base_addr = alloc_vmem(*size);
351
+ if (*base_addr != NULL) {
352
+ AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
353
+ return;
354
+ }
355
+ // try again with half the size
356
+ *size /= 2;
357
+ } while (*size > 0);
358
+
359
+ GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
360
+ }
361
+
362
+ static void free_measure_vmem(void * base_addr, size_t size) {
363
+ free_vmem(base_addr, size);
364
+ }
365
+
366
+ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
367
+ struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
368
+
369
+ void * base_addr;
370
+ size_t size;
371
+
372
+ alloc_measure_vmem(&base_addr, &size);
373
 
374
  *alloc = (struct ggml_allocr){
375
+ /*.data = */ base_addr,
376
+ /*.size = */ size,
377
+ /*.alignment = */ alignment,
 
378
  /*.n_free_blocks = */ 0,
379
  /*.free_blocks = */ {{0}},
380
  /*.hash_table = */ {{0}},
381
  /*.max_size = */ 0,
382
+ /*.measure = */ true,
383
  /*.parse_seq = */ {0},
384
  /*.parse_seq_len = */ 0,
385
  #ifdef GGML_ALLOCATOR_DEBUG
 
393
  }
394
 
395
  void ggml_allocr_free(struct ggml_allocr * alloc) {
396
+ if (alloc->measure) {
397
+ free_measure_vmem(alloc->data, alloc->size);
398
  }
399
  free(alloc);
400
  }
 
437
  case GGML_OP_ROPE:
438
  case GGML_OP_RMS_NORM:
439
  case GGML_OP_SOFT_MAX:
440
+ case GGML_OP_CONT:
441
  return true;
442
 
443
  default:
 
445
  }
446
  }
447
 
448
  static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
449
  struct hash_node * ht = alloc->hash_table;
450
  if (node->data == NULL) {
451
  if (ggml_is_view(node)) {
452
+ assert(node->view_src->data != NULL);
453
+ node->data = (char *)node->view_src->data + node->view_offs;
454
  } else {
455
  // see if we can reuse a parent's buffer (inplace)
456
  if (ggml_op_can_inplace(node->op)) {
 
478
  // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
479
  // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
480
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
481
+ node->data = parent->data;
 
 
482
  return;
483
  }
484
  }
485
  else {
486
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
487
+ node->data = parent->data;
 
 
488
  return;
489
  }
490
  }
 
495
  }
496
  }
497
 
498
+ static size_t ggml_allocr_alloc_graph_tensors_n(
499
  struct ggml_allocr * alloc,
500
  struct ggml_cgraph ** graphs, int n_graphs,
501
  struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
 
513
  if (ggml_is_view(node)) {
514
  struct ggml_tensor * view_src = node->view_src;
515
  hash_get(ht, view_src)->n_views += 1;
 
 
 
 
516
  }
517
 
518
  for (int j = 0; j < GGML_MAX_SRC; j++) {
 
521
  break;
522
  }
523
  hash_get(ht, parent)->n_children += 1;
 
 
 
524
  }
525
  }
526
  }
 
631
  }
632
 
633
  size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
634
+ return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
635
  }
636
 
637
  size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
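
A note on the restored measure allocator above: ggml_allocr_new_measure() reserves a large range of uncommitted virtual memory (VirtualAlloc with MEM_RESERVE on Windows, mmap with PROT_NONE elsewhere) so it can hand out addresses without ever committing pages, halving the request until a reservation succeeds. Below is a POSIX-only sketch of that reserve-and-halve loop; the Windows branch and the fixed-address fallback from the diff are intentionally left out.

    // Illustrative only: reserve (not commit) a large address range, halving on failure,
    // in the spirit of alloc_measure_vmem() above. Builds on Linux/macOS.
    #include <sys/mman.h>
    #include <cstdio>
    #include <cstddef>

    static void * reserve_vmem(size_t size) {
        void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
        return ptr == MAP_FAILED ? nullptr : ptr;
    }

    int main() {
        size_t size = sizeof(void *) == 4 ? (1ull << 30) : (1ull << 37);  // 1 GB or 128 GB
        void * base = nullptr;

        while (size > 0 && (base = reserve_vmem(size)) == nullptr) {
            size /= 2;   // ask for less until the kernel accepts the reservation
        }

        if (base != nullptr) {
            std::printf("reserved %.2f GB at %p (no pages committed)\n",
                        size / (1024.0 * 1024.0 * 1024.0), base);
            munmap(base, size);
        } else {
            std::printf("could not reserve any virtual memory\n");
        }
    }

Because the reserved pages are never touched, the measure pass can "allocate" every tensor in a graph and only report the peak size that a real buffer will need.
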
ggml-alloc.h CHANGED
@@ -6,27 +6,21 @@
6
  extern "C" {
7
  #endif
8
 
9
- struct ggml_backend_buffer;
10
 
11
  GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
12
  GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
13
- GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
14
 
15
  // tell the allocator to parse nodes following the order described in the list
16
  // you should call this if your graphs are optimized to execute out-of-order
17
  GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
18
 
19
- GGML_API void ggml_allocr_free (struct ggml_allocr * alloc);
20
- GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc);
21
- GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc);
22
- GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
23
  GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
24
- GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);
25
 
26
- GGML_API size_t ggml_allocr_alloc_graph_n(
27
- struct ggml_allocr * alloc,
28
- struct ggml_cgraph ** graphs, int n_graphs,
29
- struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
30
 
31
  #ifdef __cplusplus
32
  }
 
6
  extern "C" {
7
  #endif
8
 
 
9
 
10
  GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
11
  GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
12
 
13
  // tell the allocator to parse nodes following the order described in the list
14
  // you should call this if your graphs are optimized to execute out-of-order
15
  GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
16
 
17
+ GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
18
+ GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
19
+ GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
20
+ GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
21
  GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
22
+ GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
23
 
 
 
 
 
24
 
25
  #ifdef __cplusplus
26
  }
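
A note on the alignment arithmetic used throughout ggml-alloc.c: every size the allocator hands out passes through aligned_offset(), which rounds it up to the allocator's alignment. A worked sketch of that rounding, assuming a power-of-two alignment (the name align_up is local to the sketch):

    // Illustrative only: round a size or offset up to the next multiple of a
    // power-of-two alignment, the same arithmetic the allocator applies above.
    #include <cstdio>
    #include <cstddef>

    static size_t align_up(size_t offset, size_t alignment) {
        // requires alignment to be a power of two: (offset + a - 1) & ~(a - 1)
        return (offset + alignment - 1) & ~(alignment - 1);
    }

    int main() {
        const size_t alignment = 32;
        for (size_t sz : {1u, 31u, 32u, 33u, 100u}) {
            std::printf("%3zu -> %3zu\n", sz, align_up(sz, alignment));
        }
        // prints 1->32, 31->32, 32->32, 33->64, 100->128
    }
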
ggml-cuda.cu CHANGED
@@ -62,7 +62,6 @@
62
  #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
63
  #define cudaMemcpyKind hipMemcpyKind
64
  #define cudaMemset hipMemset
65
- #define cudaMemsetAsync hipMemsetAsync
66
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
67
  #define cudaSetDevice hipSetDevice
68
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -415,13 +414,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
415
  #define CUDA_SILU_BLOCK_SIZE 256
416
  #define CUDA_CPY_BLOCK_SIZE 32
417
  #define CUDA_SCALE_BLOCK_SIZE 256
418
- #define CUDA_CLAMP_BLOCK_SIZE 256
419
  #define CUDA_ROPE_BLOCK_SIZE 256
420
  #define CUDA_ALIBI_BLOCK_SIZE 32
421
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
422
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
423
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
424
- #define CUDA_GET_ROWS_BLOCK_SIZE 256
425
 
426
  // dmmv = dequantize_mul_mat_vec
427
  #ifndef GGML_CUDA_DMMV_X
@@ -1577,34 +1574,6 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1577
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
1578
  }
1579
 
1580
- template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1581
- static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
1582
- const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
1583
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
1584
-
1585
- if (col >= ncols) {
1586
- return;
1587
- }
1588
-
1589
- const int r = y[row];
1590
-
1591
- // copy x[r*ncols + col] to dst[row*ncols + col]
1592
- const int xi = r*ncols + col;
1593
- const int di = row*ncols + col;
1594
-
1595
- const int ib = xi/qk; // block index
1596
- const int iqs = (xi%qk)/qr; // quant index
1597
- const int iybs = di - di%qk; // y block start index
1598
- const int y_offset = qr == 1 ? 1 : qk/2;
1599
-
1600
- // dequantize
1601
- dfloat2 v;
1602
- dequantize_kernel(x, ib, iqs, v);
1603
-
1604
- dst[iybs + iqs + 0] = v.x;
1605
- dst[iybs + iqs + y_offset] = v.y;
1606
- }
1607
-
1608
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1609
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1610
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@@ -4586,24 +4555,6 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
4586
  dst[i] = scale * x[i];
4587
  }
4588
 
4589
- static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
4590
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
4591
-
4592
- if (i >= k) {
4593
- return;
4594
- }
4595
-
4596
- dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
4597
- }
4598
-
4599
- template<int qk, int qr, dequantize_kernel_t dq>
4600
- static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
4601
- const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
4602
- const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
4603
- const dim3 block_nums(block_num_x, nrows, 1);
4604
- k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
4605
- }
4606
-
4607
  static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4608
  const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4609
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5485,11 +5436,6 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
5485
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
5486
  }
5487
 
5488
- static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
5489
- const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
5490
- clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
5491
- }
5492
-
5493
  template<typename T>
5494
  static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5495
  const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
@@ -5753,7 +5699,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5753
  } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
5754
  GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
5755
  kind = cudaMemcpyDeviceToDevice;
5756
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5757
  int id;
5758
  CUDA_CHECK(cudaGetDevice(&id));
5759
  src_ptr = (char *) extra->data_device[id];
@@ -5789,107 +5735,6 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5789
  }
5790
  }
5791
 
5792
- static void ggml_cuda_op_repeat(
5793
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5794
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
5795
- // guaranteed to be an integer due to the check in ggml_can_repeat
5796
- const int64_t ne0 = dst->ne[0];
5797
- const int64_t ne1 = dst->ne[1];
5798
- const int64_t ne2 = dst->ne[2];
5799
- const int64_t ne3 = dst->ne[3];
5800
-
5801
- const int64_t ne00 = src0->ne[0];
5802
- const int64_t ne01 = src0->ne[1];
5803
- const int64_t ne02 = src0->ne[2];
5804
- const int64_t ne03 = src0->ne[3];
5805
-
5806
- const size_t nb0 = dst->nb[0];
5807
- const size_t nb1 = dst->nb[1];
5808
- const size_t nb2 = dst->nb[2];
5809
- const size_t nb3 = dst->nb[3];
5810
-
5811
- const size_t nb00 = src0->nb[0];
5812
- const size_t nb01 = src0->nb[1];
5813
- const size_t nb02 = src0->nb[2];
5814
- const size_t nb03 = src0->nb[3];
5815
-
5816
- const int nr0 = (int)(ne0/ne00);
5817
- const int nr1 = (int)(ne1/ne01);
5818
- const int nr2 = (int)(ne2/ne02);
5819
- const int nr3 = (int)(ne3/ne03);
5820
-
5821
- // TODO: support for transposed / permuted tensors
5822
- GGML_ASSERT(nb0 == sizeof(float));
5823
- GGML_ASSERT(nb00 == sizeof(float));
5824
-
5825
- // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
5826
- for (int i3 = 0; i3 < nr3; i3++) {
5827
- for (int k3 = 0; k3 < ne03; k3++) {
5828
- for (int i2 = 0; i2 < nr2; i2++) {
5829
- for (int k2 = 0; k2 < ne02; k2++) {
5830
- for (int i1 = 0; i1 < nr1; i1++) {
5831
- for (int k1 = 0; k1 < ne01; k1++) {
5832
- for (int i0 = 0; i0 < nr0; i0++) {
5833
- CUDA_CHECK(cudaMemcpyAsync(
5834
- (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
5835
- (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
5836
- ne00*nb0, cudaMemcpyDeviceToDevice, stream));
5837
- }
5838
- }
5839
- }
5840
- }
5841
- }
5842
- }
5843
- }
5844
-
5845
- (void) src1;
5846
- (void) src1_d;
5847
- }
5848
-
5849
- static void ggml_cuda_op_get_rows(
5850
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5851
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
5852
-
5853
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
5854
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
5855
- GGML_ASSERT(ggml_is_contiguous(src0));
5856
- GGML_ASSERT(ggml_is_contiguous(src1));
5857
- GGML_ASSERT(ggml_is_contiguous(dst));
5858
-
5859
- const int ncols = src0->ne[0];
5860
- const int nrows = ggml_nelements(src1);
5861
-
5862
- const int32_t * src1_i32 = (const int32_t *) src1_d;
5863
-
5864
- switch (src0->type) {
5865
- case GGML_TYPE_F16:
5866
- get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5867
- break;
5868
- case GGML_TYPE_F32:
5869
- get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5870
- break;
5871
- case GGML_TYPE_Q4_0:
5872
- get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5873
- break;
5874
- case GGML_TYPE_Q4_1:
5875
- get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5876
- break;
5877
- case GGML_TYPE_Q5_0:
5878
- get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5879
- break;
5880
- case GGML_TYPE_Q5_1:
5881
- get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5882
- break;
5883
- case GGML_TYPE_Q8_0:
5884
- get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5885
- break;
5886
- default:
5887
- // TODO: k-quants
5888
- GGML_ASSERT(false);
5889
- break;
5890
- }
5891
- }
5892
-
5893
  inline void ggml_cuda_op_add(
5894
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5895
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6430,12 +6275,12 @@ inline void ggml_cuda_op_alibi(
6430
  const int64_t ne02 = src0->ne[2];
6431
  const int64_t nrows = ggml_nrows(src0);
6432
 
6433
- //const int n_past = ((int32_t *) dst->op_params)[0];
6434
  const int n_head = ((int32_t *) dst->op_params)[1];
6435
  float max_bias;
6436
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
6437
 
6438
- //GGML_ASSERT(ne01 + n_past == ne00);
6439
  GGML_ASSERT(n_head == ne02);
6440
 
6441
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -6494,14 +6339,7 @@ inline void ggml_cuda_op_scale(
6494
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6495
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6496
 
6497
- float scale;
6498
- // HACK: support for ggml backend interface
6499
- if (src1->backend == GGML_BACKEND_CPU) {
6500
- scale = ((float *) src1->data)[0];
6501
- } else {
6502
- // TODO: pass pointer to kernel instead of copying to host
6503
- CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
6504
- }
6505
 
6506
  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
6507
  CUDA_CHECK(cudaGetLastError());
@@ -6511,24 +6349,6 @@ inline void ggml_cuda_op_scale(
6511
  (void) src1_dd;
6512
  }
6513
 
6514
- inline void ggml_cuda_op_clamp(
6515
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6516
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6517
-
6518
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
6519
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
6520
-
6521
- const float min = ((float *) dst->op_params)[0];
6522
- const float max = ((float *) dst->op_params)[1];
6523
-
6524
- clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
6525
- CUDA_CHECK(cudaGetLastError());
6526
-
6527
- (void) src1;
6528
- (void) dst;
6529
- (void) src1_dd;
6530
- }
6531
-
6532
  static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6533
  const int64_t nrows0 = ggml_nrows(src0);
6534
 
@@ -6538,9 +6358,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
6538
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6539
  GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6540
 
6541
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6542
- ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6543
- ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6544
 
6545
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6546
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@@ -6681,9 +6501,9 @@ static void ggml_cuda_op_mul_mat(
6681
  const size_t q8_1_ts = sizeof(block_q8_1);
6682
  const size_t q8_1_bs = QK8_1;
6683
 
6684
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6685
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6686
- ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6687
 
6688
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6689
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -6761,7 +6581,7 @@ static void ggml_cuda_op_mul_mat(
6761
  if (convert_src1_to_q8_1) {
6762
  src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6763
 
6764
- if (src1_on_device && src1_is_contiguous) {
6765
  quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6766
  CUDA_CHECK(cudaGetLastError());
6767
  }
@@ -6843,7 +6663,7 @@ static void ggml_cuda_op_mul_mat(
6843
  GGML_ASSERT(false);
6844
  }
6845
 
6846
- if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
6847
  quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6848
  CUDA_CHECK(cudaGetLastError());
6849
  }
@@ -6934,14 +6754,6 @@ static void ggml_cuda_op_mul_mat(
6934
  }
6935
  }
6936
 
6937
- static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6938
- ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
6939
- }
6940
-
6941
- static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6942
- ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
6943
- }
6944
-
6945
  static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6946
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6947
  }
@@ -6996,13 +6808,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
6996
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6997
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6998
 
6999
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7000
  void * src0_ddq = src0_extra->data_device[g_main_device];
7001
 
7002
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7003
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
7004
 
7005
- ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
7006
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
7007
 
7008
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@@ -7027,13 +6839,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
7027
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7028
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7029
 
7030
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7031
  void * src0_ddq = src0_extra->data_device[g_main_device];
7032
 
7033
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7034
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
7035
 
7036
- ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
7037
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
7038
 
7039
  const int64_t row_stride_x = nb01 / sizeof(half);
@@ -7054,11 +6866,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7054
  }
7055
  }
7056
 
7057
- if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
7058
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
7059
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
7060
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
7061
- } else if (src0->type == GGML_TYPE_F32) {
7062
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
7063
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
7064
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
@@ -7090,10 +6902,6 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
7090
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
7091
  }
7092
 
7093
- static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7094
- ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
7095
- }
7096
-
7097
  static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7098
  const int64_t ne = ggml_nelements(src0);
7099
  GGML_ASSERT(ne == ggml_nelements(src1));
@@ -7123,8 +6931,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
7123
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7124
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7125
 
7126
- const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7127
- const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7128
 
7129
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7130
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -7179,8 +6987,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
7179
 
7180
  const size_t nb1 = tensor->nb[1];
7181
 
7182
- ggml_backend_type backend = tensor->backend;
7183
- ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
7184
  memset(extra, 0, sizeof(*extra));
7185
 
7186
  for (int64_t id = 0; id < g_device_count; ++id) {
@@ -7234,6 +7042,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
7234
  CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
7235
  }
7236
 
 
7237
  CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
7238
 
7239
  extra->data_device[id] = buf;
@@ -7272,17 +7081,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
7272
  delete extra;
7273
  }
7274
 
7275
- static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
7276
  static size_t g_temp_tensor_extra_index = 0;
7277
 
7278
- static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7279
  if (g_temp_tensor_extras == nullptr) {
7280
  g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
7281
  }
7282
 
7283
  size_t alloc_index = g_temp_tensor_extra_index;
7284
  g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
7285
- ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
7286
  memset(extra, 0, sizeof(*extra));
7287
 
7288
  return extra;
@@ -7310,7 +7119,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7310
  return;
7311
  }
7312
 
7313
- ggml_tensor_extra_gpu * extra;
7314
 
7315
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7316
  tensor->op == GGML_OP_VIEW ||
@@ -7319,7 +7128,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7319
 
7320
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7321
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7322
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7323
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7324
  size_t offset = 0;
7325
  if (tensor->op == GGML_OP_VIEW) {
@@ -7328,7 +7137,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7328
  extra = ggml_cuda_alloc_temp_tensor_extra();
7329
  extra->data_device[g_main_device] = src0_ddc + offset;
7330
  } else if (tensor->op == GGML_OP_CPY) {
7331
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
7332
  void * src1_ddv = src1_extra->data_device[g_main_device];
7333
  extra = ggml_cuda_alloc_temp_tensor_extra();
7334
  extra->data_device[g_main_device] = src1_ddv;
@@ -7370,13 +7179,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
7370
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
7371
  }
7372
 
7373
- ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
7374
 
7375
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7376
  tensor->op == GGML_OP_VIEW;
7377
 
7378
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7379
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7380
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7381
  size_t view_offset = 0;
7382
  if (tensor->op == GGML_OP_VIEW) {
@@ -7394,7 +7203,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
7394
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7395
  GGML_ASSERT(ggml_is_contiguous(tensor));
7396
 
7397
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7398
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7399
  CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
7400
  }
@@ -7451,47 +7260,58 @@ void ggml_cuda_free_scratch() {
7451
  g_scratch_buffer = nullptr;
7452
  }
7453
 
7454
- bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
7455
  ggml_cuda_func_t func;
7456
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
7457
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
7458
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
7459
 
7460
- if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
7461
- return false;
7462
- }
7463
-
7464
  switch (tensor->op) {
7465
- case GGML_OP_REPEAT:
7466
- func = ggml_cuda_repeat;
7467
- break;
7468
- case GGML_OP_GET_ROWS:
7469
- func = ggml_cuda_get_rows;
7470
- break;
7471
  case GGML_OP_DUP:
 
 
 
7472
  func = ggml_cuda_dup;
7473
  break;
7474
  case GGML_OP_ADD:
 
 
 
7475
  func = ggml_cuda_add;
7476
  break;
7477
  case GGML_OP_MUL:
 
 
 
7478
  func = ggml_cuda_mul;
7479
  break;
7480
  case GGML_OP_UNARY:
7481
  switch (ggml_get_unary_op(tensor)) {
7482
  case GGML_UNARY_OP_GELU:
 
 
 
7483
  func = ggml_cuda_gelu;
7484
  break;
7485
  case GGML_UNARY_OP_SILU:
 
 
 
7486
  func = ggml_cuda_silu;
7487
  break;
7488
  default:
7489
  return false;
7490
  } break;
7491
  case GGML_OP_NORM:
 
 
 
7492
  func = ggml_cuda_norm;
7493
  break;
7494
  case GGML_OP_RMS_NORM:
 
 
 
7495
  func = ggml_cuda_rms_norm;
7496
  break;
7497
  case GGML_OP_MUL_MAT:
@@ -7501,36 +7321,54 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
7501
  func = ggml_cuda_mul_mat;
7502
  break;
7503
  case GGML_OP_SCALE:
7504
- func = ggml_cuda_scale;
7505
- break;
7506
- case GGML_OP_CLAMP:
7507
  if (!any_on_device) {
7508
  return false;
7509
  }
7510
- func = ggml_cuda_clamp;
7511
  break;
7512
  case GGML_OP_CPY:
 
 
 
7513
  func = ggml_cuda_cpy;
7514
  break;
7515
  case GGML_OP_CONT:
 
 
 
7516
  func = ggml_cuda_dup;
7517
  break;
7518
  case GGML_OP_RESHAPE:
7519
  case GGML_OP_VIEW:
7520
  case GGML_OP_PERMUTE:
7521
  case GGML_OP_TRANSPOSE:
 
 
 
7522
  func = ggml_cuda_nop;
7523
  break;
7524
  case GGML_OP_DIAG_MASK_INF:
 
 
 
7525
  func = ggml_cuda_diag_mask_inf;
7526
  break;
7527
  case GGML_OP_SOFT_MAX:
 
 
 
7528
  func = ggml_cuda_soft_max;
7529
  break;
7530
  case GGML_OP_ROPE:
 
 
 
7531
  func = ggml_cuda_rope;
7532
  break;
7533
  case GGML_OP_ALIBI:
 
 
 
7534
  func = ggml_cuda_alibi;
7535
  break;
7536
  default:
@@ -7558,263 +7396,3 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
7558
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
7559
  snprintf(description, description_size, "%s", prop.name);
7560
  }
7561
-
7562
- ////////////////////////////////////////////////////////////////////////////////
7563
-
7564
- // backend interface
7565
-
7566
- #define UNUSED GGML_UNUSED
7567
-
7568
- struct ggml_backend_context_cuda {
7569
- };
7570
-
7571
- static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
7572
- return GGML_CUDA_NAME;
7573
-
7574
- UNUSED(backend);
7575
- }
7576
-
7577
- static void ggml_backend_cuda_free(ggml_backend_t backend) {
7578
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
7579
- delete cuda_ctx;
7580
- delete backend;
7581
- }
7582
-
7583
- struct ggml_backend_buffer_context_cuda {
7584
- void * device;
7585
-
7586
- ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
7587
- size_t temp_tensor_extra_index = 0;
7588
-
7589
- ~ggml_backend_buffer_context_cuda() {
7590
- delete[] temp_tensor_extras;
7591
- }
7592
-
7593
- ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7594
- if (temp_tensor_extras == nullptr) {
7595
- temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
7596
- }
7597
-
7598
- size_t alloc_index = temp_tensor_extra_index;
7599
- temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
7600
- ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
7601
- memset(extra, 0, sizeof(*extra));
7602
-
7603
- return extra;
7604
- }
7605
- };
7606
-
7607
- static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
7608
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
7609
- CUDA_CHECK(cudaFree(ctx->device));
7610
- delete ctx;
7611
- }
7612
-
7613
- static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
7614
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
7615
- return ctx->device;
7616
- }
7617
-
7618
- static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
7619
- int64_t row_low = 0;
7620
- int64_t row_high = ggml_nrows(tensor);
7621
- int64_t nrows_split = row_high - row_low;
7622
-
7623
- size_t size = ggml_nbytes_split(tensor, nrows_split);
7624
-
7625
- int64_t ne0 = tensor->ne[0];
7626
-
7627
- if (ggml_is_quantized(tensor->type)) {
7628
- if (ne0 % MATRIX_ROW_PADDING != 0) {
7629
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
7630
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
7631
- }
7632
- }
7633
-
7634
- return size;
7635
-
7636
- UNUSED(buffer);
7637
- }
7638
-
7639
- static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
7640
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
7641
-
7642
- if (tensor->view_src != NULL && tensor->view_offs == 0) {
7643
- assert(tensor->view_src->buffer->backend == buffer->backend);
7644
- tensor->backend = tensor->view_src->backend;
7645
- tensor->extra = tensor->view_src->extra;
7646
- return;
7647
- }
7648
-
7649
- ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
7650
-
7651
- extra->data_device[g_main_device] = tensor->data;
7652
-
7653
- tensor->backend = GGML_BACKEND_GPU;
7654
- tensor->extra = extra;
7655
-
7656
- if (ggml_is_quantized(tensor->type)) {
7657
- // initialize padding to 0 to avoid possible NaN values
7658
- int64_t row_low = 0;
7659
- int64_t row_high = ggml_nrows(tensor);
7660
- int64_t nrows_split = row_high - row_low;
7661
-
7662
- size_t original_size = ggml_nbytes_split(tensor, nrows_split);
7663
- size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
7664
-
7665
- if (padded_size > original_size && tensor->view_src == nullptr) {
7666
- CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
7667
- }
7668
- }
7669
-
7670
- UNUSED(buffer);
7671
- }
7672
-
7673
- static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
7674
- /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
7675
- /* .get_base = */ ggml_backend_cuda_buffer_get_base,
7676
- /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
7677
- /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
7678
- /* .free_tensor = */ NULL,
7679
- };
7680
-
7681
- static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
7682
- ggml_cuda_set_device(g_main_device);
7683
-
7684
- ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
7685
- CUDA_CHECK(cudaMalloc(&ctx->device, size));
7686
- return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
7687
- }
7688
-
7689
- static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
7690
- return 128;
7691
- UNUSED(backend);
7692
- }
7693
-
7694
- static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
7695
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
7696
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
7697
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7698
-
7699
- CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
7700
-
7701
- UNUSED(backend);
7702
- }
7703
-
7704
- static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
7705
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
7706
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
7707
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7708
-
7709
- CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
7710
-
7711
- UNUSED(backend);
7712
- }
7713
-
7714
- static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
7715
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
7716
-
7717
- UNUSED(backend);
7718
- }
7719
-
7720
- static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
7721
- GGML_ASSERT(!"not implemented");
7722
-
7723
- return nullptr;
7724
-
7725
- UNUSED(backend);
7726
- UNUSED(cgraph);
7727
- }
7728
-
7729
- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
7730
- GGML_ASSERT(!"not implemented");
7731
-
7732
- UNUSED(backend);
7733
- UNUSED(plan);
7734
- }
7735
-
7736
- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
7737
- GGML_ASSERT(!"not implemented");
7738
-
7739
- UNUSED(backend);
7740
- UNUSED(plan);
7741
- }
7742
-
7743
- static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
7744
- ggml_cuda_set_device(g_main_device);
7745
-
7746
- ggml_compute_params params = {};
7747
- params.type = GGML_TASK_COMPUTE;
7748
- params.ith = 0;
7749
- for (int i = 0; i < cgraph->n_nodes; i++) {
7750
- ggml_tensor * node = cgraph->nodes[i];
7751
-
7752
- assert(node->backend == GGML_BACKEND_GPU);
7753
- for (int j = 0; j < GGML_MAX_SRC; j++) {
7754
- if (node->src[j] != nullptr) {
7755
- assert(node->src[j]->backend == GGML_BACKEND_GPU);
7756
- }
7757
- }
7758
-
7759
- bool ok = ggml_cuda_compute_forward(&params, node);
7760
- if (!ok) {
7761
- fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
7762
- }
7763
- GGML_ASSERT(ok);
7764
-
7765
- #if 0
7766
- if (node->type == GGML_TYPE_F32) {
7767
- cudaDeviceSynchronize();
7768
- std::vector<float> tmp(ggml_nelements(node), 0.0f);
7769
- cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
7770
- printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
7771
- ggml_type_name(node->src[0]->type),
7772
- node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
7773
- node->src[0]->name,
7774
- node->src[1] ? node->src[1]->name : "none");
7775
- double sum = 0.0;
7776
- double sq_sum = 0.0;
7777
- for (int i = 0; i < ggml_nelements(node); i++) {
7778
- printf("%f ", tmp[i]);
7779
- sum += tmp[i];
7780
- sq_sum += tmp[i]*tmp[i];
7781
- }
7782
- printf("\n");
7783
- printf("sum: %f, ", sum);
7784
- printf("sq_sum: %f\n", sq_sum);
7785
- }
7786
- #endif
7787
- }
7788
-
7789
- UNUSED(backend);
7790
- }
7791
-
7792
- static ggml_backend_i cuda_backend_i = {
7793
- /* .get_name = */ ggml_backend_cuda_name,
7794
- /* .free = */ ggml_backend_cuda_free,
7795
- /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
7796
- /* .get_alignment = */ ggml_backend_cuda_get_alignment,
7797
- /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
7798
- /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
7799
- /* .synchronize = */ ggml_backend_cuda_synchronize,
7800
- /* .cpy_tensor_from = */ nullptr,
7801
- /* .cpy_tensor_to = */ nullptr,
7802
- /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
7803
- /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
7804
- /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
7805
- /* .graph_compute = */ ggml_backend_cuda_graph_compute,
7806
- /* .supports_op = */ nullptr,
7807
- };
7808
-
7809
- ggml_backend_t ggml_backend_cuda_init() {
7810
- ggml_init_cublas(); // TODO: remove from ggml.c
7811
-
7812
- ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
7813
-
7814
- ggml_backend_t cuda_backend = new ggml_backend {
7815
- /* .interface = */ cuda_backend_i,
7816
- /* .context = */ ctx
7817
- };
7818
-
7819
- return cuda_backend;
7820
- }
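
A note on the CUDA changes above: both the removed backend glue and the surviving ggml_cuda_alloc_temp_tensor_extra() hand out per-tensor "extra" structs from a fixed-size ring and simply reuse slots, which works because these extras only have to live for one graph evaluation. A minimal sketch of that ring-reuse pattern, with a stand-in extra type and a deliberately tiny ring:

    // Illustrative only: a fixed-size ring of per-tensor scratch structs, reused in
    // round-robin order like ggml_cuda_alloc_temp_tensor_extra() above.
    #include <cstdio>
    #include <cstring>

    constexpr int MAX_NODES = 4;      // ggml uses GGML_MAX_NODES; kept tiny for the demo

    struct tensor_extra {
        void * data_device[2];        // stand-in for per-GPU device pointers
    };

    static tensor_extra g_extras[MAX_NODES];
    static int          g_extra_index = 0;

    static tensor_extra * alloc_temp_extra() {
        tensor_extra * extra = &g_extras[g_extra_index];
        g_extra_index = (g_extra_index + 1) % MAX_NODES;   // wrap around and reuse
        std::memset(extra, 0, sizeof(*extra));             // hand back a zeroed slot
        return extra;
    }

    int main() {
        // after MAX_NODES allocations the first slot comes around again, which is
        // acceptable for temporaries that never outlive a single graph evaluation
        tensor_extra * first = alloc_temp_extra();
        for (int i = 0; i < MAX_NODES - 1; ++i) {
            alloc_temp_extra();
        }
        std::printf("slot reused: %d\n", alloc_temp_extra() == first);   // prints 1
    }
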
 
62
  #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
63
  #define cudaMemcpyKind hipMemcpyKind
64
  #define cudaMemset hipMemset
 
65
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
66
  #define cudaSetDevice hipSetDevice
67
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 
414
  #define CUDA_SILU_BLOCK_SIZE 256
415
  #define CUDA_CPY_BLOCK_SIZE 32
416
  #define CUDA_SCALE_BLOCK_SIZE 256
 
417
  #define CUDA_ROPE_BLOCK_SIZE 256
418
  #define CUDA_ALIBI_BLOCK_SIZE 32
419
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
420
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
421
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
422
 
423
  // dmmv = dequantize_mul_mat_vec
424
  #ifndef GGML_CUDA_DMMV_X
 
1574
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
1575
  }
1576
 
1577
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1578
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1579
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
 
4555
  dst[i] = scale * x[i];
4556
  }
4557
 
4558
  static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4559
  const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4560
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 
5436
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
5437
  }
5438
 
 
 
 
 
 
5439
  template<typename T>
5440
  static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5441
  const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
 
5699
  } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
5700
  GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
5701
  kind = cudaMemcpyDeviceToDevice;
5702
+ struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5703
  int id;
5704
  CUDA_CHECK(cudaGetDevice(&id));
5705
  src_ptr = (char *) extra->data_device[id];
 
5735
  }
5736
  }
5737
 
5738
  inline void ggml_cuda_op_add(
5739
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5740
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
6275
  const int64_t ne02 = src0->ne[2];
6276
  const int64_t nrows = ggml_nrows(src0);
6277
 
6278
+ const int n_past = ((int32_t *) dst->op_params)[0];
6279
  const int n_head = ((int32_t *) dst->op_params)[1];
6280
  float max_bias;
6281
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
6282
 
6283
+ GGML_ASSERT(ne01 + n_past == ne00);
6284
  GGML_ASSERT(n_head == ne02);
6285
 
6286
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
 
6339
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6340
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6341
 
6342
+ const float scale = ((float *) src1->data)[0];
 
 
 
 
 
 
 
6343
 
6344
  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
6345
  CUDA_CHECK(cudaGetLastError());
 
6349
  (void) src1_dd;
6350
  }
6351
 
6352
  static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6353
  const int64_t nrows0 = ggml_nrows(src0);
6354
 
 
6358
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6359
  GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6360
 
6361
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6362
+ struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6363
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6364
 
6365
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6366
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
 
6501
  const size_t q8_1_ts = sizeof(block_q8_1);
6502
  const size_t q8_1_bs = QK8_1;
6503
 
6504
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6505
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6506
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6507
 
6508
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6509
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
 
6581
  if (convert_src1_to_q8_1) {
6582
  src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6583
 
6584
+ if (split && src1_on_device && src1_is_contiguous) {
6585
  quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6586
  CUDA_CHECK(cudaGetLastError());
6587
  }
 
6663
  GGML_ASSERT(false);
6664
  }
6665
 
6666
+ if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
6667
  quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6668
  CUDA_CHECK(cudaGetLastError());
6669
  }
 
6754
  }
6755
  }
6756
 
6757
  static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6758
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6759
  }
 
6808
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6809
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6810
 
6811
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6812
  void * src0_ddq = src0_extra->data_device[g_main_device];
6813
 
6814
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6815
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
6816
 
6817
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6818
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6819
 
6820
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 
6839
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6840
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6841
 
6842
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6843
  void * src0_ddq = src0_extra->data_device[g_main_device];
6844
 
6845
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6846
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
6847
 
6848
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6849
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6850
 
6851
  const int64_t row_stride_x = nb01 / sizeof(half);
 
6866
  }
6867
  }
6868
 
6869
+ if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
6870
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
6871
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
6872
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
6873
+ }else if (src0->type == GGML_TYPE_F32) {
6874
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6875
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
6876
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
 
6902
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
6903
  }
6904
 
 
 
 
 
6905
  static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6906
  const int64_t ne = ggml_nelements(src0);
6907
  GGML_ASSERT(ne == ggml_nelements(src1));
 
6931
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6932
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6933
 
6934
+ const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6935
+ const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6936
 
6937
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
6938
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
 
6987
 
6988
  const size_t nb1 = tensor->nb[1];
6989
 
6990
+ ggml_backend backend = tensor->backend;
6991
+ struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
6992
  memset(extra, 0, sizeof(*extra));
6993
 
6994
  for (int64_t id = 0; id < g_device_count; ++id) {
 
7042
  CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
7043
  }
7044
 
7045
+
7046
  CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
7047
 
7048
  extra->data_device[id] = buf;
 
7081
  delete extra;
7082
  }
7083
 
7084
+ static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
7085
  static size_t g_temp_tensor_extra_index = 0;
7086
 
7087
+ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7088
  if (g_temp_tensor_extras == nullptr) {
7089
  g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
7090
  }
7091
 
7092
  size_t alloc_index = g_temp_tensor_extra_index;
7093
  g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
7094
+ struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
7095
  memset(extra, 0, sizeof(*extra));
7096
 
7097
  return extra;
 
7119
  return;
7120
  }
7121
 
7122
+ struct ggml_tensor_extra_gpu * extra;
7123
 
7124
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7125
  tensor->op == GGML_OP_VIEW ||
 
7128
 
7129
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7130
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7131
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7132
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7133
  size_t offset = 0;
7134
  if (tensor->op == GGML_OP_VIEW) {
 
7137
  extra = ggml_cuda_alloc_temp_tensor_extra();
7138
  extra->data_device[g_main_device] = src0_ddc + offset;
7139
  } else if (tensor->op == GGML_OP_CPY) {
7140
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
7141
  void * src1_ddv = src1_extra->data_device[g_main_device];
7142
  extra = ggml_cuda_alloc_temp_tensor_extra();
7143
  extra->data_device[g_main_device] = src1_ddv;
 
7179
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
7180
  }
7181
 
7182
+ struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
7183
 
7184
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7185
  tensor->op == GGML_OP_VIEW;
7186
 
7187
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7188
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7189
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7190
  size_t view_offset = 0;
7191
  if (tensor->op == GGML_OP_VIEW) {
 
7203
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7204
  GGML_ASSERT(ggml_is_contiguous(tensor));
7205
 
7206
+ struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7207
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7208
  CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
7209
  }
 
7260
  g_scratch_buffer = nullptr;
7261
  }
7262
 
7263
+ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
7264
  ggml_cuda_func_t func;
7265
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
7266
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
7267
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
7268
 
 
 
 
 
7269
  switch (tensor->op) {
 
 
 
 
 
 
7270
  case GGML_OP_DUP:
7271
+ if (!any_on_device) {
7272
+ return false;
7273
+ }
7274
  func = ggml_cuda_dup;
7275
  break;
7276
  case GGML_OP_ADD:
7277
+ if (!any_on_device) {
7278
+ return false;
7279
+ }
7280
  func = ggml_cuda_add;
7281
  break;
7282
  case GGML_OP_MUL:
7283
+ if (!any_on_device) {
7284
+ return false;
7285
+ }
7286
  func = ggml_cuda_mul;
7287
  break;
7288
  case GGML_OP_UNARY:
7289
  switch (ggml_get_unary_op(tensor)) {
7290
  case GGML_UNARY_OP_GELU:
7291
+ if (!any_on_device) {
7292
+ return false;
7293
+ }
7294
  func = ggml_cuda_gelu;
7295
  break;
7296
  case GGML_UNARY_OP_SILU:
7297
+ if (!any_on_device) {
7298
+ return false;
7299
+ }
7300
  func = ggml_cuda_silu;
7301
  break;
7302
  default:
7303
  return false;
7304
  } break;
7305
  case GGML_OP_NORM:
7306
+ if (!any_on_device) {
7307
+ return false;
7308
+ }
7309
  func = ggml_cuda_norm;
7310
  break;
7311
  case GGML_OP_RMS_NORM:
7312
+ if (!any_on_device) {
7313
+ return false;
7314
+ }
7315
  func = ggml_cuda_rms_norm;
7316
  break;
7317
  case GGML_OP_MUL_MAT:
 
7321
  func = ggml_cuda_mul_mat;
7322
  break;
7323
  case GGML_OP_SCALE:
 
 
 
7324
  if (!any_on_device) {
7325
  return false;
7326
  }
7327
+ func = ggml_cuda_scale;
7328
  break;
7329
  case GGML_OP_CPY:
7330
+ if (!any_on_device) {
7331
+ return false;
7332
+ }
7333
  func = ggml_cuda_cpy;
7334
  break;
7335
  case GGML_OP_CONT:
7336
+ if (!any_on_device) {
7337
+ return false;
7338
+ }
7339
  func = ggml_cuda_dup;
7340
  break;
7341
  case GGML_OP_RESHAPE:
7342
  case GGML_OP_VIEW:
7343
  case GGML_OP_PERMUTE:
7344
  case GGML_OP_TRANSPOSE:
7345
+ if (!any_on_device) {
7346
+ return false;
7347
+ }
7348
  func = ggml_cuda_nop;
7349
  break;
7350
  case GGML_OP_DIAG_MASK_INF:
7351
+ if (!any_on_device) {
7352
+ return false;
7353
+ }
7354
  func = ggml_cuda_diag_mask_inf;
7355
  break;
7356
  case GGML_OP_SOFT_MAX:
7357
+ if (!any_on_device) {
7358
+ return false;
7359
+ }
7360
  func = ggml_cuda_soft_max;
7361
  break;
7362
  case GGML_OP_ROPE:
7363
+ if (!any_on_device) {
7364
+ return false;
7365
+ }
7366
  func = ggml_cuda_rope;
7367
  break;
7368
  case GGML_OP_ALIBI:
7369
+ if (!any_on_device) {
7370
+ return false;
7371
+ }
7372
  func = ggml_cuda_alibi;
7373
  break;
7374
  default:
 
7396
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
7397
  snprintf(description, description_size, "%s", prop.name);
7398
  }
 
ggml-cuda.h CHANGED
@@ -1,7 +1,6 @@
1
  #pragma once
2
 
3
  #include "ggml.h"
4
- #include "ggml-backend.h"
5
 
6
  #ifdef GGML_USE_HIPBLAS
7
  #define GGML_CUDA_NAME "ROCm"
@@ -43,9 +42,6 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s
43
  GGML_API int ggml_cuda_get_device_count(void);
44
  GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
45
 
46
- // backend API
47
- GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
48
-
49
  #ifdef __cplusplus
50
  }
51
  #endif
 
1
  #pragma once
2
 
3
  #include "ggml.h"
 
4
 
5
  #ifdef GGML_USE_HIPBLAS
6
  #define GGML_CUDA_NAME "ROCm"
 
42
  GGML_API int ggml_cuda_get_device_count(void);
43
  GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
44
 
 
 
 
45
  #ifdef __cplusplus
46
  }
47
  #endif
ggml-metal.h CHANGED
@@ -20,7 +20,6 @@
20
  #pragma once
21
 
22
  #include "ggml.h"
23
- #include "ggml-backend.h"
24
 
25
  #include <stddef.h>
26
  #include <stdbool.h>
@@ -36,15 +35,10 @@ struct ggml_cgraph;
36
  extern "C" {
37
  #endif
38
 
39
- //
40
- // internal API
41
- // temporary exposed to user-code
42
- //
43
 
44
  struct ggml_metal_context;
45
 
46
- void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
47
-
48
  // number of command buffers to use
49
  struct ggml_metal_context * ggml_metal_init(int n_cb);
50
  void ggml_metal_free(struct ggml_metal_context * ctx);
@@ -89,17 +83,6 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
89
  // creates gf->n_threads command buffers in parallel
90
  void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
91
 
92
- //
93
- // backend API
94
- // user-code should use only these functions
95
- //
96
-
97
- GGML_API ggml_backend_t ggml_backend_metal_init(void);
98
-
99
- GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
100
-
101
- GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
102
-
103
  #ifdef __cplusplus
104
  }
105
  #endif
 
20
  #pragma once
21
 
22
  #include "ggml.h"
 
23
 
24
  #include <stddef.h>
25
  #include <stdbool.h>
 
35
  extern "C" {
36
  #endif
37
 
38
+ void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
 
 
 
39
 
40
  struct ggml_metal_context;
41
 
 
 
42
  // number of command buffers to use
43
  struct ggml_metal_context * ggml_metal_init(int n_cb);
44
  void ggml_metal_free(struct ggml_metal_context * ctx);
 
83
  // creates gf->n_threads command buffers in parallel
84
  void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
85
 
 
 
 
 
 
 
 
 
 
 
 
86
  #ifdef __cplusplus
87
  }
88
  #endif
ggml-metal.m CHANGED
@@ -779,8 +779,8 @@ void ggml_metal_graph_compute(
779
  } break;
780
  case GGML_OP_CONCAT:
781
  {
782
- const int64_t nb = ne00;
783
 
 
784
  [encoder setComputePipelineState:ctx->pipeline_concat];
785
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
786
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
@@ -812,7 +812,6 @@ void ggml_metal_graph_compute(
812
  [encoder setBytes:&nb length:sizeof(nb) atIndex:27];
813
 
814
  const int nth = MIN(1024, ne0);
815
-
816
  [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
817
  } break;
818
  case GGML_OP_ADD:
@@ -910,10 +909,9 @@ void ggml_metal_graph_compute(
910
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
911
  [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
912
 
913
- const int64_t n = ggml_nelements(dst);
914
- GGML_ASSERT(n % 4 == 0);
915
 
916
- [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
917
  } break;
918
  case GGML_OP_UNARY:
919
  switch (ggml_get_unary_op(gf->nodes[i])) {
@@ -923,10 +921,9 @@ void ggml_metal_graph_compute(
923
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
924
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
925
 
926
- const int64_t n = ggml_nelements(dst);
927
- GGML_ASSERT(n % 4 == 0);
928
 
929
- [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
930
  } break;
931
  case GGML_UNARY_OP_RELU:
932
  {
@@ -944,10 +941,9 @@ void ggml_metal_graph_compute(
944
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
945
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
946
 
947
- const int64_t n = ggml_nelements(dst);
948
- GGML_ASSERT(n % 4 == 0);
949
 
950
- [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
951
  } break;
952
  default:
953
  {
@@ -1044,7 +1040,7 @@ void ggml_metal_graph_compute(
1044
  !ggml_is_transposed(src0) &&
1045
  !ggml_is_transposed(src1) &&
1046
  src1t == GGML_TYPE_F32 &&
1047
- ne00 % 32 == 0 && ne00 >= 64 &&
1048
  ne11 > ne11_mm_min) {
1049
  //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
1050
  switch (src0->type) {
@@ -1255,8 +1251,6 @@ void ggml_metal_graph_compute(
1255
  } break;
1256
  case GGML_OP_RMS_NORM:
1257
  {
1258
- GGML_ASSERT(ne00 % 4 == 0);
1259
-
1260
  float eps;
1261
  memcpy(&eps, dst->op_params, sizeof(float));
1262
 
@@ -1299,7 +1293,7 @@ void ggml_metal_graph_compute(
1299
 
1300
  const int nth = MIN(1024, ne00);
1301
 
1302
- //const int n_past = ((int32_t *) dst->op_params)[0];
1303
  const int n_head = ((int32_t *) dst->op_params)[1];
1304
  float max_bias;
1305
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
@@ -1477,140 +1471,3 @@ preferably one under the recommended max working set size, or else fall back to
1477
 
1478
  }
1479
  }
1480
-
1481
- ////////////////////////////////////////////////////////////////////////////////
1482
-
1483
- // backend interface
1484
-
1485
- static const char * ggml_backend_metal_name(ggml_backend_t backend) {
1486
- return "Metal";
1487
-
1488
- UNUSED(backend);
1489
- }
1490
-
1491
- static void ggml_backend_metal_free(ggml_backend_t backend) {
1492
- struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
1493
- ggml_metal_free(ctx);
1494
- free(backend);
1495
- }
1496
-
1497
- static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
1498
- return (void *)buffer->context;
1499
- }
1500
-
1501
- static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1502
- free(buffer->context);
1503
- UNUSED(buffer);
1504
- }
1505
-
1506
- static struct ggml_backend_buffer_i metal_backend_buffer_i = {
1507
- /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
1508
- /* .get_base = */ ggml_backend_metal_buffer_get_base,
1509
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
1510
- /* .init_tensor = */ NULL, // no initialization required
1511
- /* .free_tensor = */ NULL, // no cleanup required
1512
- };
1513
-
1514
- static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
1515
- struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
1516
-
1517
- void * data = ggml_metal_host_malloc(size);
1518
-
1519
- // TODO: set proper name of the buffers
1520
- ggml_metal_add_buffer(ctx, "backend", data, size, 0);
1521
-
1522
- return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
1523
- }
1524
-
1525
- static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
1526
- return 32;
1527
- UNUSED(backend);
1528
- }
1529
-
1530
- static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
1531
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
1532
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
1533
-
1534
- memcpy((char *)tensor->data + offset, data, size);
1535
-
1536
- UNUSED(backend);
1537
- }
1538
-
1539
- static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
1540
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
1541
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
1542
-
1543
- memcpy(data, (const char *)tensor->data + offset, size);
1544
-
1545
- UNUSED(backend);
1546
- }
1547
-
1548
- static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
1549
- UNUSED(backend);
1550
- }
1551
-
1552
- static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
1553
- ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
1554
-
1555
- UNUSED(backend);
1556
- }
1557
-
1558
- static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
1559
- ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
1560
-
1561
- UNUSED(backend);
1562
- }
1563
-
1564
- static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
1565
- struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
1566
-
1567
- ggml_metal_graph_compute(metal_ctx, cgraph);
1568
- }
1569
-
1570
- static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
1571
- return true;
1572
- UNUSED(backend);
1573
- UNUSED(op);
1574
- }
1575
-
1576
- static struct ggml_backend_i metal_backend_i = {
1577
- /* .get_name = */ ggml_backend_metal_name,
1578
- /* .free = */ ggml_backend_metal_free,
1579
- /* .alloc_buffer = */ ggml_backend_metal_alloc_buffer,
1580
- /* .get_alignment = */ ggml_backend_metal_get_alignment,
1581
- /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
1582
- /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
1583
- /* .synchronize = */ ggml_backend_metal_synchronize,
1584
- /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from,
1585
- /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to,
1586
- /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm
1587
- /* .graph_plan_free = */ NULL,
1588
- /* .graph_plan_compute = */ NULL,
1589
- /* .graph_compute = */ ggml_backend_metal_graph_compute,
1590
- /* .supports_op = */ ggml_backend_metal_supports_op,
1591
- };
1592
-
1593
- ggml_backend_t ggml_backend_metal_init(void) {
1594
- struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
1595
-
1596
- ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
1597
-
1598
- ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
1599
-
1600
- *metal_backend = (struct ggml_backend) {
1601
- /* .interface = */ metal_backend_i,
1602
- /* .context = */ ctx,
1603
- };
1604
-
1605
- return metal_backend;
1606
- }
1607
-
1608
- bool ggml_backend_is_metal(ggml_backend_t backend) {
1609
- return backend->iface.get_name == ggml_backend_metal_name;
1610
- }
1611
-
1612
- void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
1613
- struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
1614
-
1615
- ggml_metal_set_n_cb(ctx, n_cb);
1616
- }
 
779
  } break;
780
  case GGML_OP_CONCAT:
781
  {
 
782
 
783
+ int64_t nb = ne00;
784
  [encoder setComputePipelineState:ctx->pipeline_concat];
785
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
786
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
 
812
  [encoder setBytes:&nb length:sizeof(nb) atIndex:27];
813
 
814
  const int nth = MIN(1024, ne0);
 
815
  [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
816
  } break;
817
  case GGML_OP_ADD:
 
909
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
910
  [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
911
 
912
+ const int64_t n = ggml_nelements(dst)/4;
 
913
 
914
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
915
  } break;
916
  case GGML_OP_UNARY:
917
  switch (ggml_get_unary_op(gf->nodes[i])) {
 
921
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
922
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
923
 
924
+ const int64_t n = ggml_nelements(dst)/4;
 
925
 
926
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
927
  } break;
928
  case GGML_UNARY_OP_RELU:
929
  {
 
941
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
942
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
943
 
944
+ const int64_t n = ggml_nelements(dst)/4;
 
945
 
946
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
947
  } break;
948
  default:
949
  {
 
1040
  !ggml_is_transposed(src0) &&
1041
  !ggml_is_transposed(src1) &&
1042
  src1t == GGML_TYPE_F32 &&
1043
+ ne00 % 32 == 0 &&
1044
  ne11 > ne11_mm_min) {
1045
  //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
1046
  switch (src0->type) {
 
1251
  } break;
1252
  case GGML_OP_RMS_NORM:
1253
  {
 
 
1254
  float eps;
1255
  memcpy(&eps, dst->op_params, sizeof(float));
1256
 
 
1293
 
1294
  const int nth = MIN(1024, ne00);
1295
 
1296
+ const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
1297
  const int n_head = ((int32_t *) dst->op_params)[1];
1298
  float max_bias;
1299
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
1471
 
1472
  }
1473
  }
 
ggml-metal.metal CHANGED
@@ -345,11 +345,10 @@ kernel void kernel_rms_norm(
345
  uint sgitg[[simdgroup_index_in_threadgroup]],
346
  uint tiisg[[thread_index_in_simdgroup]],
347
  uint ntg[[threads_per_threadgroup]]) {
348
- device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
349
- device const float * x_scalar = (device const float *) x;
350
-
351
- float4 sumf = 0;
352
- float all_sum = 0;
353
 
354
  // parallel sum
355
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
@@ -362,7 +361,6 @@ kernel void kernel_rms_norm(
362
  }
363
 
364
  threadgroup_barrier(mem_flags::mem_threadgroup);
365
-
366
  // broadcast, simd group number is ntg / 32
367
  for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
368
  if (tpitg < i) {
@@ -370,9 +368,7 @@ kernel void kernel_rms_norm(
370
  }
371
  }
372
  if (tpitg == 0) {
373
- for (int i = 4 * (ne00 / 4); i < ne00; i++) {
374
- sum[0] += x_scalar[i];
375
- }
376
  sum[0] /= ne00;
377
  }
378
 
@@ -387,9 +383,7 @@ kernel void kernel_rms_norm(
387
  y[i00] = x[i00] * scale;
388
  }
389
  if (tpitg == 0) {
390
- for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
391
- y_scalar[i00] = x_scalar[i00] * scale;
392
- }
393
  }
394
  }
395
 
 
345
  uint sgitg[[simdgroup_index_in_threadgroup]],
346
  uint tiisg[[thread_index_in_simdgroup]],
347
  uint ntg[[threads_per_threadgroup]]) {
348
+ device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
349
+ device const float * x_scalar = (device const float *) x;
350
+ float4 sumf=0;
351
+ float all_sum=0;
 
352
 
353
  // parallel sum
354
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
 
361
  }
362
 
363
  threadgroup_barrier(mem_flags::mem_threadgroup);
 
364
  // broadcast, simd group number is ntg / 32
365
  for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
366
  if (tpitg < i) {
 
368
  }
369
  }
370
  if (tpitg == 0) {
371
+ for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
 
 
372
  sum[0] /= ne00;
373
  }
374
 
 
383
  y[i00] = x[i00] * scale;
384
  }
385
  if (tpitg == 0) {
386
+ for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
 
 
387
  }
388
  }
389
 
ggml.c CHANGED
@@ -162,16 +162,40 @@ typedef void * thread_ret_t;
162
 
163
  #define GGML_PRINT(...) printf(__VA_ARGS__)
164
 
165
- //
166
- // end of logging block
167
- //
168
-
169
  #ifdef GGML_USE_ACCELERATE
170
  // uncomment to use vDSP for soft max computation
171
  // note: not sure if it is actually faster
172
  //#define GGML_SOFT_MAX_ACCELERATE
173
  #endif
174
 
175
  #if defined(_MSC_VER) || defined(__MINGW32__)
176
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
177
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
@@ -4928,7 +4952,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4928
  *result = (struct ggml_tensor) {
4929
  /*.type =*/ type,
4930
  /*.backend =*/ GGML_BACKEND_CPU,
4931
- /*.buffer =*/ NULL,
4932
  /*.n_dims =*/ n_dims,
4933
  /*.ne =*/ { 1, 1, 1, 1 },
4934
  /*.nb =*/ { 0, 0, 0, 0 },
@@ -11234,7 +11257,7 @@ static void ggml_compute_forward_silu_f32(
11234
 
11235
  #ifndef NDEBUG
11236
  for (int k = 0; k < nc; k++) {
11237
- const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
11238
  UNUSED(x);
11239
  assert(!isnan(x));
11240
  assert(!isinf(x));
@@ -13060,22 +13083,24 @@ static void ggml_compute_forward_alibi_f32(
13060
  return;
13061
  }
13062
 
13063
- //const int n_past = ((int32_t *) dst->op_params)[0];
13064
  const int n_head = ((int32_t *) dst->op_params)[1];
13065
  float max_bias;
13066
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
13067
 
13068
- const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
13069
- const int64_t ne1 = src0->ne[1]; // seq_len_without_past
13070
- const int64_t ne2 = src0->ne[2]; // n_head -> this is k
13071
- //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
 
 
13072
 
13073
- const int64_t n = ggml_nrows(src0);
13074
- const int64_t ne2_ne3 = n/ne1; // ne2*ne3
13075
 
13076
- const size_t nb0 = src0->nb[0];
13077
- const size_t nb1 = src0->nb[1];
13078
- const size_t nb2 = src0->nb[2];
13079
  //const int nb3 = src0->nb[3];
13080
 
13081
  GGML_ASSERT(nb0 == sizeof(float));
@@ -13087,9 +13112,9 @@ static void ggml_compute_forward_alibi_f32(
13087
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
13088
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
13089
 
13090
- for (int64_t i = 0; i < ne0; i++) {
13091
- for (int64_t j = 0; j < ne1; j++) {
13092
- for (int64_t k = 0; k < ne2_ne3; k++) {
13093
  float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
13094
  float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
13095
 
@@ -13104,6 +13129,7 @@ static void ggml_compute_forward_alibi_f32(
13104
  }
13105
 
13106
  pdst[0] = i * m_k + src[0];
 
13107
  }
13108
  }
13109
  }
@@ -20174,10 +20200,6 @@ static enum ggml_opt_result ggml_opt_lbfgs(
20174
  ggml_vec_cpy_f32(nx, xp, x);
20175
  ggml_vec_cpy_f32(nx, gp, g);
20176
 
20177
- // TODO: instead of passing &cancel here, use the return code of the linesearch
20178
- // to determine if the optimization should be cancelled
20179
- // this is a simple change, but not doing this atm, since I don't have a nice
20180
- // way to test and don't want to break something with so many changes lined up
20181
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
20182
  if (cancel) {
20183
  return GGML_OPT_CANCEL;
 
162
 
163
  #define GGML_PRINT(...) printf(__VA_ARGS__)
164
 
 
 
 
 
165
  #ifdef GGML_USE_ACCELERATE
166
  // uncomment to use vDSP for soft max computation
167
  // note: not sure if it is actually faster
168
  //#define GGML_SOFT_MAX_ACCELERATE
169
  #endif
170
 
171
+ //
172
+ // logging
173
+ //
174
+
175
+ #if (GGML_DEBUG >= 1)
176
+ #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
177
+ #else
178
+ #define GGML_PRINT_DEBUG(...)
179
+ #endif
180
+
181
+ #if (GGML_DEBUG >= 5)
182
+ #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
183
+ #else
184
+ #define GGML_PRINT_DEBUG_5(...)
185
+ #endif
186
+
187
+ #if (GGML_DEBUG >= 10)
188
+ #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
189
+ #else
190
+ #define GGML_PRINT_DEBUG_10(...)
191
+ #endif
192
+
193
+ #define GGML_PRINT(...) printf(__VA_ARGS__)
194
+
195
+ //
196
+ // end of logging block
197
+ //
198
+
199
  #if defined(_MSC_VER) || defined(__MINGW32__)
200
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
201
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 
4952
  *result = (struct ggml_tensor) {
4953
  /*.type =*/ type,
4954
  /*.backend =*/ GGML_BACKEND_CPU,
 
4955
  /*.n_dims =*/ n_dims,
4956
  /*.ne =*/ { 1, 1, 1, 1 },
4957
  /*.nb =*/ { 0, 0, 0, 0 },
 
11257
 
11258
  #ifndef NDEBUG
11259
  for (int k = 0; k < nc; k++) {
11260
+ const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
11261
  UNUSED(x);
11262
  assert(!isnan(x));
11263
  assert(!isinf(x));
 
13083
  return;
13084
  }
13085
 
13086
+ const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
13087
  const int n_head = ((int32_t *) dst->op_params)[1];
13088
  float max_bias;
13089
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
13090
 
13091
+ assert(n_past >= 0);
13092
+
13093
+ const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
13094
+ const int ne1 = src0->ne[1]; // seq_len_without_past
13095
+ const int ne2 = src0->ne[2]; // n_head -> this is k
13096
+ //const int ne3 = src0->ne[3]; // 1 -> bsz
13097
 
13098
+ const int n = ggml_nrows(src0);
13099
+ const int ne2_ne3 = n/ne1; // ne2*ne3
13100
 
13101
+ const int nb0 = src0->nb[0];
13102
+ const int nb1 = src0->nb[1];
13103
+ const int nb2 = src0->nb[2];
13104
  //const int nb3 = src0->nb[3];
13105
 
13106
  GGML_ASSERT(nb0 == sizeof(float));
 
13112
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
13113
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
13114
 
13115
+ for (int i = 0; i < ne0; i++) {
13116
+ for (int j = 0; j < ne1; j++) {
13117
+ for (int k = 0; k < ne2_ne3; k++) {
13118
  float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
13119
  float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
13120
 
 
13129
  }
13130
 
13131
  pdst[0] = i * m_k + src[0];
13132
+
13133
  }
13134
  }
13135
  }
 
20200
  ggml_vec_cpy_f32(nx, xp, x);
20201
  ggml_vec_cpy_f32(nx, gp, g);
20202
 
 
 
 
 
20203
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
20204
  if (cancel) {
20205
  return GGML_OPT_CANCEL;
ggml.h CHANGED
@@ -326,7 +326,7 @@ extern "C" {
326
  GGML_TYPE_COUNT,
327
  };
328
 
329
- enum ggml_backend_type {
330
  GGML_BACKEND_CPU = 0,
331
  GGML_BACKEND_GPU = 10,
332
  GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,10 +479,8 @@ extern "C" {
479
 
480
  // n-dimensional tensor
481
  struct ggml_tensor {
482
- enum ggml_type type;
483
- enum ggml_backend_type backend;
484
-
485
- struct ggml_backend_buffer * buffer;
486
 
487
  int n_dims;
488
  int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -516,7 +514,7 @@ extern "C" {
516
 
517
  void * extra; // extra things e.g. for ggml-cuda.cu
518
 
519
- char padding[12];
520
  };
521
 
522
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -1360,7 +1358,7 @@ extern "C" {
1360
 
1361
  // alibi position embedding
1362
  // in-place, returns view(a)
1363
- GGML_API struct ggml_tensor * ggml_alibi(
1364
  struct ggml_context * ctx,
1365
  struct ggml_tensor * a,
1366
  int n_past,
@@ -1369,7 +1367,7 @@ extern "C" {
1369
 
1370
  // clamp
1371
  // in-place, returns view(a)
1372
- GGML_API struct ggml_tensor * ggml_clamp(
1373
  struct ggml_context * ctx,
1374
  struct ggml_tensor * a,
1375
  float min,
@@ -2104,7 +2102,7 @@ extern "C" {
2104
  enum ggml_type vec_dot_type;
2105
  } ggml_type_traits_t;
2106
 
2107
- GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2108
 
2109
  #ifdef __cplusplus
2110
  }
 
326
  GGML_TYPE_COUNT,
327
  };
328
 
329
+ enum ggml_backend {
330
  GGML_BACKEND_CPU = 0,
331
  GGML_BACKEND_GPU = 10,
332
  GGML_BACKEND_GPU_SPLIT = 20,
 
479
 
480
  // n-dimensional tensor
481
  struct ggml_tensor {
482
+ enum ggml_type type;
483
+ enum ggml_backend backend;
 
 
484
 
485
  int n_dims;
486
  int64_t ne[GGML_MAX_DIMS]; // number of elements
 
514
 
515
  void * extra; // extra things e.g. for ggml-cuda.cu
516
 
517
+ char padding[4];
518
  };
519
 
520
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
1358
 
1359
  // alibi position embedding
1360
  // in-place, returns view(a)
1361
+ struct ggml_tensor * ggml_alibi(
1362
  struct ggml_context * ctx,
1363
  struct ggml_tensor * a,
1364
  int n_past,
 
1367
 
1368
  // clamp
1369
  // in-place, returns view(a)
1370
+ struct ggml_tensor * ggml_clamp(
1371
  struct ggml_context * ctx,
1372
  struct ggml_tensor * a,
1373
  float min,
 
2102
  enum ggml_type vec_dot_type;
2103
  } ggml_type_traits_t;
2104
 
2105
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2106
 
2107
  #ifdef __cplusplus
2108
  }
gguf-py/gguf/gguf.py CHANGED
@@ -88,31 +88,29 @@ class MODEL_ARCH(IntEnum):
88
  PERSIMMON : int = auto()
89
  REFACT : int = auto()
90
  BERT : int = auto()
91
- BLOOM : int = auto()
92
 
93
 
94
  class MODEL_TENSOR(IntEnum):
95
- TOKEN_EMBD : int = auto()
96
- TOKEN_EMBD_NORM : int = auto()
97
- TOKEN_TYPES : int = auto()
98
- POS_EMBD : int = auto()
99
- OUTPUT : int = auto()
100
- OUTPUT_NORM : int = auto()
101
- ROPE_FREQS : int = auto()
102
- ATTN_Q : int = auto()
103
- ATTN_K : int = auto()
104
- ATTN_V : int = auto()
105
- ATTN_QKV : int = auto()
106
- ATTN_OUT : int = auto()
107
- ATTN_NORM : int = auto()
108
- ATTN_NORM_2 : int = auto()
109
- ATTN_ROT_EMBD : int = auto()
110
- FFN_GATE : int = auto()
111
- FFN_DOWN : int = auto()
112
- FFN_UP : int = auto()
113
- FFN_NORM : int = auto()
114
- ATTN_Q_NORM : int = auto()
115
- ATTN_K_NORM : int = auto()
116
 
117
 
118
  MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -127,31 +125,29 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
127
  MODEL_ARCH.PERSIMMON: "persimmon",
128
  MODEL_ARCH.REFACT: "refact",
129
  MODEL_ARCH.BERT: "bert",
130
- MODEL_ARCH.BLOOM: "bloom",
131
  }
132
 
133
  TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
134
- MODEL_TENSOR.TOKEN_EMBD: "token_embd",
135
- MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
136
- MODEL_TENSOR.TOKEN_TYPES: "token_types",
137
- MODEL_TENSOR.POS_EMBD: "position_embd",
138
- MODEL_TENSOR.OUTPUT_NORM: "output_norm",
139
- MODEL_TENSOR.OUTPUT: "output",
140
- MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
141
- MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
142
- MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
143
- MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
144
- MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
145
- MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
146
- MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
147
- MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
148
- MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
149
- MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
150
- MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
151
- MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
152
- MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
153
- MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
154
- MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
155
  }
156
 
157
  MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -286,18 +282,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
286
  MODEL_TENSOR.FFN_DOWN,
287
  MODEL_TENSOR.FFN_UP,
288
  ],
289
- MODEL_ARCH.BLOOM: [
290
- MODEL_TENSOR.TOKEN_EMBD,
291
- MODEL_TENSOR.TOKEN_EMBD_NORM,
292
- MODEL_TENSOR.OUTPUT_NORM,
293
- MODEL_TENSOR.OUTPUT,
294
- MODEL_TENSOR.ATTN_NORM,
295
- MODEL_TENSOR.ATTN_QKV,
296
- MODEL_TENSOR.ATTN_OUT,
297
- MODEL_TENSOR.FFN_NORM,
298
- MODEL_TENSOR.FFN_DOWN,
299
- MODEL_TENSOR.FFN_UP,
300
- ],
301
  MODEL_ARCH.GPT2: [
302
  # TODO
303
  ],
@@ -327,7 +311,6 @@ class TensorNameMap:
327
  "gpt_neox.embed_in", # gptneox
328
  "transformer.wte", # gpt2 gpt-j mpt refact
329
  "transformer.word_embeddings", # falcon
330
- "word_embeddings", # bloom
331
  "model.embed_tokens", # llama-hf
332
  "tok_embeddings", # llama-pth
333
  "embeddings.word_embeddings", # bert
@@ -339,11 +322,6 @@ class TensorNameMap:
339
  "embeddings.token_type_embeddings", # bert
340
  ),
341
 
342
- # Normalization of token embeddings
343
- MODEL_TENSOR.TOKEN_EMBD_NORM: (
344
- "word_embeddings_layernorm", # bloom
345
- ),
346
-
347
  # Position embeddings
348
  MODEL_TENSOR.POS_EMBD: (
349
  "transformer.wpe", # gpt2
@@ -354,7 +332,7 @@ class TensorNameMap:
354
  MODEL_TENSOR.OUTPUT: (
355
  "embed_out", # gptneox
356
  "lm_head", # gpt2 mpt falcon llama-hf baichuan
357
- "output", # llama-pth bloom
358
  "word_embeddings_for_head", # persimmon
359
  ),
360
 
@@ -366,7 +344,7 @@ class TensorNameMap:
366
  "norm", # llama-pth
367
  "embeddings.LayerNorm", # bert
368
  "transformer.norm_f", # mpt
369
- "ln_f", # refact bloom
370
  "language_model.encoder.final_layernorm", # persimmon
371
  ),
372
 
@@ -383,7 +361,6 @@ class TensorNameMap:
383
  "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
384
  "transformer.blocks.{bid}.norm_1", # mpt
385
  "transformer.h.{bid}.input_layernorm", # falcon7b
386
- "h.{bid}.input_layernorm", # bloom
387
  "transformer.h.{bid}.ln_mlp", # falcon40b
388
  "model.layers.{bid}.input_layernorm", # llama-hf
389
  "layers.{bid}.attention_norm", # llama-pth
@@ -402,7 +379,6 @@ class TensorNameMap:
402
  "transformer.h.{bid}.attn.c_attn", # gpt2
403
  "transformer.blocks.{bid}.attn.Wqkv", # mpt
404
  "transformer.h.{bid}.self_attention.query_key_value", # falcon
405
- "h.{bid}.self_attention.query_key_value", # bloom
406
  "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
407
  ),
408
 
@@ -436,7 +412,6 @@ class TensorNameMap:
436
  "transformer.h.{bid}.attn.c_proj", # gpt2 refact
437
  "transformer.blocks.{bid}.attn.out_proj", # mpt
438
  "transformer.h.{bid}.self_attention.dense", # falcon
439
- "h.{bid}.self_attention.dense", # bloom
440
  "model.layers.{bid}.self_attn.o_proj", # llama-hf
441
  "layers.{bid}.attention.wo", # llama-pth
442
  "encoder.layer.{bid}.attention.output.dense", # bert
@@ -454,7 +429,6 @@ class TensorNameMap:
454
  MODEL_TENSOR.FFN_NORM: (
455
  "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
456
  "transformer.h.{bid}.ln_2", # gpt2 refact
457
- "h.{bid}.post_attention_layernorm", # bloom
458
  "transformer.blocks.{bid}.norm_2", # mpt
459
  "model.layers.{bid}.post_attention_layernorm", # llama-hf
460
  "layers.{bid}.ffn_norm", # llama-pth
@@ -468,7 +442,6 @@ class TensorNameMap:
468
  "transformer.h.{bid}.mlp.c_fc", # gpt2
469
  "transformer.blocks.{bid}.ffn.up_proj", # mpt
470
  "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
471
- "h.{bid}.mlp.dense_h_to_4h", # bloom
472
  "model.layers.{bid}.mlp.up_proj", # llama-hf refact
473
  "layers.{bid}.feed_forward.w3", # llama-pth
474
  "encoder.layer.{bid}.intermediate.dense", # bert
@@ -488,7 +461,6 @@ class TensorNameMap:
488
  "transformer.h.{bid}.mlp.c_proj", # gpt2 refact
489
  "transformer.blocks.{bid}.ffn.down_proj", # mpt
490
  "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
491
- "h.{bid}.mlp.dense_4h_to_h", # bloom
492
  "model.layers.{bid}.mlp.down_proj", # llama-hf
493
  "layers.{bid}.feed_forward.w2", # llama-pth
494
  "encoder.layer.{bid}.output.dense", # bert
 
88
  PERSIMMON : int = auto()
89
  REFACT : int = auto()
90
  BERT : int = auto()
 
91
 
92
 
93
  class MODEL_TENSOR(IntEnum):
94
+ TOKEN_EMBD : int = auto()
95
+ TOKEN_TYPES : int = auto()
96
+ POS_EMBD : int = auto()
97
+ OUTPUT : int = auto()
98
+ OUTPUT_NORM : int = auto()
99
+ ROPE_FREQS : int = auto()
100
+ ATTN_Q : int = auto()
101
+ ATTN_K : int = auto()
102
+ ATTN_V : int = auto()
103
+ ATTN_QKV : int = auto()
104
+ ATTN_OUT : int = auto()
105
+ ATTN_NORM : int = auto()
106
+ ATTN_NORM_2 : int = auto()
107
+ ATTN_ROT_EMBD: int = auto()
108
+ FFN_GATE : int = auto()
109
+ FFN_DOWN : int = auto()
110
+ FFN_UP : int = auto()
111
+ FFN_NORM : int = auto()
112
+ ATTN_Q_NORM : int = auto()
113
+ ATTN_K_NORM : int = auto()
 
114
 
115
 
116
  MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
 
125
  MODEL_ARCH.PERSIMMON: "persimmon",
126
  MODEL_ARCH.REFACT: "refact",
127
  MODEL_ARCH.BERT: "bert",
 
128
  }
129
 
130
  TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
131
+ MODEL_TENSOR.TOKEN_EMBD: "token_embd",
132
+ MODEL_TENSOR.TOKEN_TYPES: "token_types",
133
+ MODEL_TENSOR.POS_EMBD: "position_embd",
134
+ MODEL_TENSOR.OUTPUT_NORM: "output_norm",
135
+ MODEL_TENSOR.OUTPUT: "output",
136
+ MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
137
+ MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
138
+ MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
139
+ MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
140
+ MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
141
+ MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
142
+ MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
143
+ MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
144
+ MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
145
+ MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
146
+ MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
147
+ MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
148
+ MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
149
+ MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
150
+ MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
 
151
  }
152
 
153
  MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
 
282
  MODEL_TENSOR.FFN_DOWN,
283
  MODEL_TENSOR.FFN_UP,
284
  ],
 
 
 
 
 
 
 
 
 
 
 
 
285
  MODEL_ARCH.GPT2: [
286
  # TODO
287
  ],
 
311
  "gpt_neox.embed_in", # gptneox
312
  "transformer.wte", # gpt2 gpt-j mpt refact
313
  "transformer.word_embeddings", # falcon
 
314
  "model.embed_tokens", # llama-hf
315
  "tok_embeddings", # llama-pth
316
  "embeddings.word_embeddings", # bert
 
322
  "embeddings.token_type_embeddings", # bert
323
  ),
324
 
 
 
 
 
 
325
  # Position embeddings
326
  MODEL_TENSOR.POS_EMBD: (
327
  "transformer.wpe", # gpt2
 
332
  MODEL_TENSOR.OUTPUT: (
333
  "embed_out", # gptneox
334
  "lm_head", # gpt2 mpt falcon llama-hf baichuan
335
+ "output", # llama-pth
336
  "word_embeddings_for_head", # persimmon
337
  ),
338
 
 
344
  "norm", # llama-pth
345
  "embeddings.LayerNorm", # bert
346
  "transformer.norm_f", # mpt
347
+ "ln_f", # refact
348
  "language_model.encoder.final_layernorm", # persimmon
349
  ),
350
 
 
361
  "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
362
  "transformer.blocks.{bid}.norm_1", # mpt
363
  "transformer.h.{bid}.input_layernorm", # falcon7b
 
364
  "transformer.h.{bid}.ln_mlp", # falcon40b
365
  "model.layers.{bid}.input_layernorm", # llama-hf
366
  "layers.{bid}.attention_norm", # llama-pth
 
379
  "transformer.h.{bid}.attn.c_attn", # gpt2
380
  "transformer.blocks.{bid}.attn.Wqkv", # mpt
381
  "transformer.h.{bid}.self_attention.query_key_value", # falcon
 
382
  "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
383
  ),
384
 
 
412
  "transformer.h.{bid}.attn.c_proj", # gpt2 refact
413
  "transformer.blocks.{bid}.attn.out_proj", # mpt
414
  "transformer.h.{bid}.self_attention.dense", # falcon
 
415
  "model.layers.{bid}.self_attn.o_proj", # llama-hf
416
  "layers.{bid}.attention.wo", # llama-pth
417
  "encoder.layer.{bid}.attention.output.dense", # bert
 
429
  MODEL_TENSOR.FFN_NORM: (
430
  "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
431
  "transformer.h.{bid}.ln_2", # gpt2 refact
 
432
  "transformer.blocks.{bid}.norm_2", # mpt
433
  "model.layers.{bid}.post_attention_layernorm", # llama-hf
434
  "layers.{bid}.ffn_norm", # llama-pth
 
442
  "transformer.h.{bid}.mlp.c_fc", # gpt2
443
  "transformer.blocks.{bid}.ffn.up_proj", # mpt
444
  "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
 
445
  "model.layers.{bid}.mlp.up_proj", # llama-hf refact
446
  "layers.{bid}.feed_forward.w3", # llama-pth
447
  "encoder.layer.{bid}.intermediate.dense", # bert
 
461
  "transformer.h.{bid}.mlp.c_proj", # gpt2 refact
462
  "transformer.blocks.{bid}.ffn.down_proj", # mpt
463
  "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
 
464
  "model.layers.{bid}.mlp.down_proj", # llama-hf
465
  "layers.{bid}.feed_forward.w2", # llama-pth
466
  "encoder.layer.{bid}.output.dense", # bert
gpttype_adapter.cpp CHANGED
@@ -1768,7 +1768,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1768
  int realnpredict = params.n_predict-stopper_unused_tokens;
1769
  float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
1770
  float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
1771
- printf("\nContextLimit: %d/%d, Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs (%.1fT/s)",current_context_tokens.size(),nctx, time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
1772
  fflush(stdout);
1773
  output.status = 1;
1774
  generation_finished = true;
 
1768
  int realnpredict = params.n_predict-stopper_unused_tokens;
1769
  float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
1770
  float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
1771
+ printf("\nTime Taken - Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs (%.1fT/s)", time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
1772
  fflush(stdout);
1773
  output.status = 1;
1774
  generation_finished = true;
koboldcpp.py CHANGED
@@ -184,10 +184,6 @@ def init_library():
184
  os.add_dll_directory(dir_path)
185
  os.add_dll_directory(abs_path)
186
  os.add_dll_directory(os.getcwd())
187
- if libname == lib_hipblas and "HIP_PATH" in os.environ:
188
- os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin"))
189
- if args.debugmode == 1:
190
- print(f"HIP/ROCm SDK at {os.environ['HIP_PATH']} included in .DLL load path")
191
  handle = ctypes.CDLL(os.path.join(dir_path, libname))
192
 
193
  handle.load_model.argtypes = [load_model_inputs]
@@ -365,7 +361,7 @@ maxhordelen = 256
365
  modelbusy = threading.Lock()
366
  requestsinqueue = 0
367
  defaultport = 5001
368
- KcppVersion = "1.47"
369
  showdebug = True
370
  showsamplerwarning = True
371
  showmaxctxwarning = True
@@ -373,8 +369,6 @@ session_kudos_earned = 0
373
  session_jobs = 0
374
  session_starttime = None
375
  exitcounter = 0
376
- punishcounter = 0 #causes a timeout if too many errors
377
- rewardcounter = 0 #reduces error counts for successful jobs
378
  totalgens = 0
379
  currentusergenkey = "" #store a special key so polled streaming works even in multiuser
380
  args = None #global args
@@ -418,34 +412,16 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
418
  elif api_format==4:
419
  # translate openai chat completion messages format into one big string.
420
  messages_array = genparams.get('messages', [])
421
- adapter_obj = genparams.get('adapter', {})
422
  messages_string = ""
423
- system_message_start = adapter_obj.get("system_start", "\n### Instruction:\n")
424
- system_message_end = adapter_obj.get("system_end", "")
425
- user_message_start = adapter_obj.get("user_start", "\n### Instruction:\n")
426
- user_message_end = adapter_obj.get("user_end", "")
427
- assistant_message_start = adapter_obj.get("assistant_start", "\n### Response:\n")
428
- assistant_message_end = adapter_obj.get("assistant_end", "")
429
-
430
  for message in messages_array:
431
  if message['role'] == "system":
432
- messages_string += system_message_start
433
  elif message['role'] == "user":
434
- messages_string += user_message_start
435
  elif message['role'] == "assistant":
436
- messages_string += assistant_message_start
437
-
438
- messages_string += message['content']
439
-
440
- if message['role'] == "system":
441
- messages_string += system_message_end
442
- elif message['role'] == "user":
443
- messages_string += user_message_end
444
- elif message['role'] == "assistant":
445
- messages_string += assistant_message_end
446
-
447
- messages_string += assistant_message_start
448
-
449
  genparams["prompt"] = messages_string
450
  frqp = genparams.get('frequency_penalty', 0.1)
451
  scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
@@ -521,9 +497,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
521
  async def handle_sse_stream(self, api_format):
522
  global friendlymodelname
523
  self.send_response(200)
524
- self.send_header("cache-control", "no-cache")
525
- self.send_header("connection", "keep-alive")
526
- self.end_headers(content_type='text/event-stream')
527
 
528
  current_token = 0
529
  incomplete_token_buffer = bytearray()
@@ -590,10 +566,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
590
  global maxctx, maxhordelen, friendlymodelname, KcppVersion, totalgens
591
  self.path = self.path.rstrip('/')
592
  response_body = None
593
- content_type = 'application/json'
594
 
595
  if self.path in ["", "/?"] or self.path.startswith(('/?','?')): #it's possible for the root url to have ?params without /
596
- content_type = 'text/html'
597
  if self.embedded_kailite is None:
598
  response_body = (f"Embedded Kobold Lite is not found.<br>You will have to connect via the main KoboldAI client, or <a href='https://lite.koboldai.net?local=1&port={self.port}'>use this URL</a> to connect.").encode()
599
  else:
@@ -639,9 +615,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
639
 
640
  elif self.path.endswith('/v1/models'):
641
  response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
 
642
 
643
  elif self.path=="/api":
644
- content_type = 'text/html'
645
  if self.embedded_kcpp_docs is None:
646
  response_body = (f"KoboldCpp partial API reference can be found at the wiki: https://github.com/LostRuins/koboldcpp/wiki").encode()
647
  else:
@@ -649,40 +625,41 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
649
  elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
650
  self.path = "/api"
651
  self.send_response(302)
652
- self.send_header("location", self.path)
653
- self.end_headers(content_type='text/html')
654
  return None
655
 
656
  if response_body is None:
657
  self.send_response(404)
658
- self.end_headers(content_type='text/html')
659
  rp = 'Error: HTTP Server is running, but this endpoint does not exist. Please check the URL.'
660
  self.wfile.write(rp.encode())
661
  else:
662
  self.send_response(200)
663
- self.send_header('content-length', str(len(response_body)))
664
- self.end_headers(content_type=content_type)
665
  self.wfile.write(response_body)
666
  return
667
 
668
  def do_POST(self):
669
  global modelbusy, requestsinqueue, currentusergenkey, totalgens
670
- content_length = int(self.headers['content-length'])
671
  body = self.rfile.read(content_length)
672
  self.path = self.path.rstrip('/')
 
673
  if self.path.endswith(('/api/extra/tokencount')):
674
  try:
675
  genparams = json.loads(body)
676
  countprompt = genparams.get('prompt', "")
677
  count = handle.token_count(countprompt.encode("UTF-8"))
678
  self.send_response(200)
679
- self.end_headers(content_type='application/json')
680
  self.wfile.write(json.dumps({"value": count}).encode())
681
 
682
  except ValueError as e:
683
  utfprint("Count Tokens - Body Error: " + str(e))
684
  self.send_response(400)
685
- self.end_headers(content_type='application/json')
686
  self.wfile.write(json.dumps({"value": -1}).encode())
687
  return
688
 
@@ -695,11 +672,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
695
  multiuserkey = ""
696
  pass
697
 
698
- if (multiuserkey=="" and requestsinqueue==0) or (multiuserkey!="" and multiuserkey==currentusergenkey):
699
  ag = handle.abort_generate()
700
  time.sleep(0.3) #short delay before replying
701
  self.send_response(200)
702
- self.end_headers(content_type='application/json')
703
  self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
704
  print("\nGeneration Aborted")
705
  else:
@@ -717,11 +694,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
717
  pass
718
 
719
  if totalgens>0:
720
- if (multiuserkey=="" and requestsinqueue==0) or (multiuserkey!="" and multiuserkey==currentusergenkey):
721
  pendtxt = handle.get_pending_output()
722
  pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore")
723
  self.send_response(200)
724
- self.end_headers(content_type='application/json')
725
  self.wfile.write(json.dumps({"results": [{"text": pendtxtStr}]}).encode())
726
  return
727
 
@@ -731,7 +708,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
731
  requestsinqueue += 1
732
  if not modelbusy.acquire(blocking=reqblocking):
733
  self.send_response(503)
734
- self.end_headers(content_type='application/json')
735
  self.wfile.write(json.dumps({"detail": {
736
  "msg": "Server is busy; please try again later.",
737
  "type": "service_unavailable",
@@ -757,9 +734,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
757
 
758
  if self.path.endswith('/v1/completions'):
759
  api_format = 3
 
760
 
761
  if self.path.endswith('/v1/chat/completions'):
762
  api_format = 4
 
763
 
764
  if api_format > 0:
765
  genparams = None
@@ -785,8 +764,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
785
  # Headers are already sent when streaming
786
  if not sse_stream_flag:
787
  self.send_response(200)
788
- self.end_headers(content_type='application/json')
789
- self.wfile.write(json.dumps(gen).encode())
790
  except:
791
  print("Generate: The response could not be sent, maybe connection was terminated?")
792
  return
@@ -794,23 +773,27 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
794
  modelbusy.release()
795
 
796
  self.send_response(404)
797
- self.end_headers(content_type='text/html')
798
 
799
 
800
  def do_OPTIONS(self):
801
  self.send_response(200)
802
- self.end_headers(content_type='text/html')
803
 
804
  def do_HEAD(self):
805
  self.send_response(200)
806
- self.end_headers(content_type='text/html')
807
-
808
- def end_headers(self, content_type=None):
809
- self.send_header('access-control-allow-origin', '*')
810
- self.send_header('access-control-allow-methods', '*')
811
- self.send_header('access-control-allow-headers', '*, Accept, Content-Type, Content-Length, Accept-Encoding, X-CSRF-Token, Client-Agent, X-Fields, Content-Type, Authorization, X-Requested-With, X-HTTP-Method-Override, apikey, genkey')
812
- if content_type is not None:
813
- self.send_header('content-type', content_type)
814
  return super(ServerRequestHandler, self).end_headers()
815
 
816
 
@@ -1034,8 +1017,7 @@ def show_new_gui():
1034
  mmq_var = ctk.IntVar(value=1)
1035
  blas_threads_var = ctk.StringVar()
1036
  blas_size_var = ctk.IntVar()
1037
- version_var = ctk.StringVar(value="0")
1038
- tensor_split_str_vars = ctk.StringVar(value="")
1039
 
1040
  smartcontext = ctk.IntVar()
1041
  context_var = ctk.IntVar()
@@ -1087,15 +1069,11 @@ def show_new_gui():
1087
  quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
1088
  mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
1089
  quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
1090
- tensor_split_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
1091
- tensor_split_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
1092
  else:
1093
  lowvram_box.grid_forget()
1094
  quick_lowvram_box.grid_forget()
1095
  mmq_box.grid_forget()
1096
  quick_mmq_box.grid_forget()
1097
- tensor_split_label.grid_forget()
1098
- tensor_split_entry.grid_forget()
1099
 
1100
  if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
1101
  gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
@@ -1108,7 +1086,6 @@ def show_new_gui():
1108
  quick_gpu_layers_label.grid_forget()
1109
  quick_gpu_layers_entry.grid_forget()
1110
 
1111
-
1112
  # presets selector
1113
  makelabel(quick_tab, "Presets:", 1)
1114
 
@@ -1141,7 +1118,7 @@ def show_new_gui():
1141
  makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2)
1142
 
1143
  # load model
1144
- makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170)
1145
 
1146
  # Hardware Tab
1147
  hardware_tab = tabcontent["Hardware"]
@@ -1160,7 +1137,6 @@ def show_new_gui():
1160
  gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4"], width=60, variable=gpu_choice_var, state="readonly")
1161
  CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4", "All"], width=60, variable=gpu_choice_var, state="readonly")
1162
  gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50)
1163
- tensor_split_entry,tensor_split_label = makelabelentry(hardware_tab, "Tensor Split:", tensor_split_str_vars, 6, 80)
1164
  lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0)
1165
  mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1)
1166
 
@@ -1209,7 +1185,7 @@ def show_new_gui():
1209
  # Model Tab
1210
  model_tab = tabcontent["Model"]
1211
 
1212
- makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1)
1213
  makefileentry(model_tab, "Lora:", "Select Lora File",lora_var, 3)
1214
  makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5)
1215
 
@@ -1289,12 +1265,6 @@ def show_new_gui():
1289
  args.noavx2 = True
1290
  args.noblas = True
1291
  args.nommap = True
1292
- if tensor_split_str_vars.get()!="":
1293
- tssv = tensor_split_str_vars.get()
1294
- if "," in tssv:
1295
- args.tensor_split = [float(x) for x in tssv.split(",")]
1296
- else:
1297
- args.tensor_split = [float(x) for x in tssv.split(" ")]
1298
 
1299
  args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
1300
 
@@ -1359,9 +1329,6 @@ def show_new_gui():
1359
  runopts_var.set(openblas_option)
1360
  if "gpulayers" in dict and dict["gpulayers"]:
1361
  gpulayers_var.set(dict["gpulayers"])
1362
- if "tensor_split" in dict and dict["tensor_split"]:
1363
- tssep = ','.join(map(str, dict["tensor_split"]))
1364
- tensor_split_str_vars.set(tssep)
1365
  if "blasthreads" in dict and dict["blasthreads"]:
1366
  blas_threads_var.set(str(dict["blasthreads"]))
1367
  else:
@@ -1480,7 +1447,7 @@ def show_gui_msgbox(title,message):
1480
  def run_horde_worker(args, api_key, worker_name):
1481
  import urllib.request
1482
  from datetime import datetime
1483
- global friendlymodelname, maxhordectx, maxhordelen, exitcounter, punishcounter, modelbusy, session_starttime
1484
  epurl = f"http://localhost:{args.port}"
1485
  if args.host!="":
1486
  epurl = f"http://{args.host}:{args.port}"
@@ -1489,11 +1456,10 @@ def run_horde_worker(args, api_key, worker_name):
1489
  print(f"{datetime.now().strftime('[%H:%M:%S]')} " + txt)
1490
 
1491
  def submit_completed_generation(url, jobid, sessionstart, submit_dict):
1492
- global exitcounter, punishcounter, session_kudos_earned, session_jobs, rewardcounter
1493
  reply = make_url_request(url, submit_dict)
1494
  if not reply:
1495
  exitcounter += 1
1496
- punishcounter += 1
1497
  print_with_time(f"Error, Job submit failed.")
1498
  else:
1499
  reward = reply["reward"]
@@ -1507,11 +1473,6 @@ def run_horde_worker(args, api_key, worker_name):
1507
  elapsedtimestr = f"{hrs:03d}h:{mins:02d}m:{secs:02d}s"
1508
  earnrate = session_kudos_earned/(elapsedtime.seconds/3600)
1509
  print_with_time(f'Submitted {jobid} and earned {reward:.0f} kudos\n[Total:{session_kudos_earned:.0f} kudos, Time:{elapsedtimestr}, Jobs:{session_jobs}, EarnRate:{earnrate:.0f} kudos/hr]')
1510
- rewardcounter += 1
1511
- if rewardcounter > 50:
1512
- rewardcounter = 0
1513
- if exitcounter > 5:
1514
- exitcounter -= 1
1515
 
1516
  def make_url_request(url, data, method='POST'):
1517
  try:
@@ -1520,7 +1481,7 @@ def run_horde_worker(args, api_key, worker_name):
1520
  if method=='POST':
1521
  json_payload = json.dumps(data).encode('utf-8')
1522
  request = urllib.request.Request(url, data=json_payload, headers=headers, method=method)
1523
- request.add_header('content-type', 'application/json')
1524
  else:
1525
  request = urllib.request.Request(url, headers=headers, method=method)
1526
  response_data = ""
@@ -1547,23 +1508,17 @@ def run_horde_worker(args, api_key, worker_name):
1547
  print(f"===\nEmbedded Horde Worker '{worker_name}' Starting...\n(To use your own KAI Bridge/Scribe worker instead, don't set your API key)")
1548
  BRIDGE_AGENT = f"KoboldCppEmbedWorker:2:https://github.com/LostRuins/koboldcpp"
1549
  cluster = "https://horde.koboldai.net"
1550
- while exitcounter < 35:
1551
  time.sleep(3)
1552
  readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
1553
  if readygo:
1554
  print_with_time(f"Embedded Horde Worker '{worker_name}' is started.")
1555
  break
1556
 
1557
- while exitcounter < 35:
1558
  currentjob_attempts = 0
1559
  current_generation = None
1560
 
1561
- if punishcounter >= 10:
1562
- punishcounter = 0
1563
- print_with_time(f"Horde Worker Paused for 10 min - Too many errors. It will resume automatically.")
1564
- print_with_time(f"Caution: Too many failed jobs may lead to entering maintenance mode.")
1565
- time.sleep(600)
1566
-
1567
  #first, make sure we are not generating
1568
  if modelbusy.locked():
1569
  time.sleep(0.2)
@@ -1582,7 +1537,6 @@ def run_horde_worker(args, api_key, worker_name):
1582
  pop = make_url_request(f'{cluster}/api/v2/generate/text/pop',gen_dict)
1583
  if not pop:
1584
  exitcounter += 1
1585
- punishcounter += 1
1586
  print_with_time(f"Failed to fetch job from {cluster}. Waiting 5 seconds...")
1587
  time.sleep(5)
1588
  continue
@@ -1601,7 +1555,7 @@ def run_horde_worker(args, api_key, worker_name):
1601
  print_with_time(f"Job received from {cluster} for {current_payload.get('max_length',80)} tokens and {current_payload.get('max_context_length',1024)} max context. Starting generation...")
1602
 
1603
  #do gen
1604
- while exitcounter < 35:
1605
  if not modelbusy.locked():
1606
  current_generation = make_url_request(f'{epurl}/api/v1/generate', current_payload)
1607
  if current_generation:
@@ -1926,10 +1880,4 @@ if __name__ == '__main__':
1926
  parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", action='store_true')
1927
  parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
1928
 
1929
- # #deprecated hidden args. they do nothing. do not use
1930
- # parser.add_argument("--psutil_set_threads", action='store_true', help=argparse.SUPPRESS)
1931
- # parser.add_argument("--stream", action='store_true', help=argparse.SUPPRESS)
1932
- # parser.add_argument("--unbantokens", action='store_true', help=argparse.SUPPRESS)
1933
- # parser.add_argument("--usemirostat", action='store_true', help=argparse.SUPPRESS)
1934
-
1935
  main(parser.parse_args(),start_server=True)
 
184
  os.add_dll_directory(dir_path)
185
  os.add_dll_directory(abs_path)
186
  os.add_dll_directory(os.getcwd())
187
  handle = ctypes.CDLL(os.path.join(dir_path, libname))
188
 
189
  handle.load_model.argtypes = [load_model_inputs]
 
361
  modelbusy = threading.Lock()
362
  requestsinqueue = 0
363
  defaultport = 5001
364
+ KcppVersion = "1.46.1"
365
  showdebug = True
366
  showsamplerwarning = True
367
  showmaxctxwarning = True
 
369
  session_jobs = 0
370
  session_starttime = None
371
  exitcounter = 0
 
 
372
  totalgens = 0
373
  currentusergenkey = "" #store a special key so polled streaming works even in multiuser
374
  args = None #global args
 
412
  elif api_format==4:
413
  # translate openai chat completion messages format into one big string.
414
  messages_array = genparams.get('messages', [])
 
415
  messages_string = ""
416
  for message in messages_array:
417
  if message['role'] == "system":
418
+ messages_string+="\n### Instruction:\n"
419
  elif message['role'] == "user":
420
+ messages_string+="\n### Instruction:\n"
421
  elif message['role'] == "assistant":
422
+ messages_string+="\n### Response:\n"
423
+ messages_string+=message['content']
424
+ messages_string += "\n### Response:\n"
425
  genparams["prompt"] = messages_string
426
  frqp = genparams.get('frequency_penalty', 0.1)
427
  scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
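
For reference, the api_format==4 branch above flattens the OpenAI-style messages array into one Alpaca-style prompt string. A minimal standalone sketch of that logic (the helper name is the editor's, not part of the commit):

    def flatten_chat_messages(messages):
        # system and user turns both open an "### Instruction" block,
        # assistant turns open a "### Response" block, mirroring the loop above
        prompt = ""
        for message in messages:
            if message['role'] in ("system", "user"):
                prompt += "\n### Instruction:\n"
            elif message['role'] == "assistant":
                prompt += "\n### Response:\n"
            prompt += message['content']
        # a trailing Response header so generation continues as the assistant
        prompt += "\n### Response:\n"
        return prompt

    # flatten_chat_messages([{"role": "user", "content": "Hello"}])
    # -> "\n### Instruction:\nHello\n### Response:\n"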
 
497
  async def handle_sse_stream(self, api_format):
498
  global friendlymodelname
499
  self.send_response(200)
500
+ self.send_header("Cache-Control", "no-cache")
501
+ self.send_header("Connection", "keep-alive")
502
+ self.end_headers(force_json=True, sse_stream_flag=True)
503
 
504
  current_token = 0
505
  incomplete_token_buffer = bytearray()
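
The handler above advertises text/event-stream, so tokens are delivered as standard Server-Sent Events. A generic framing helper, shown only as a sketch (the exact event fields koboldcpp emits are not visible in this hunk):

    import json

    def sse_event(payload: dict) -> bytes:
        # one SSE frame: a "data:" line terminated by a blank line
        return ("data: " + json.dumps(payload) + "\n\n").encode("utf-8")

    # e.g. self.wfile.write(sse_event({"token": "Hello"})) inside the streaming loop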
 
566
  global maxctx, maxhordelen, friendlymodelname, KcppVersion, totalgens
567
  self.path = self.path.rstrip('/')
568
  response_body = None
569
+ force_json = False
570
 
571
  if self.path in ["", "/?"] or self.path.startswith(('/?','?')): #it's possible for the root url to have ?params without /
572
+
573
  if self.embedded_kailite is None:
574
  response_body = (f"Embedded Kobold Lite is not found.<br>You will have to connect via the main KoboldAI client, or <a href='https://lite.koboldai.net?local=1&port={self.port}'>use this URL</a> to connect.").encode()
575
  else:
 
615
 
616
  elif self.path.endswith('/v1/models'):
617
  response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
618
+ force_json = True
619
 
620
  elif self.path=="/api":
 
621
  if self.embedded_kcpp_docs is None:
622
  response_body = (f"KoboldCpp partial API reference can be found at the wiki: https://github.com/LostRuins/koboldcpp/wiki").encode()
623
  else:
 
625
  elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
626
  self.path = "/api"
627
  self.send_response(302)
628
+ self.send_header("Location", self.path)
629
+ self.end_headers()
630
  return None
631
 
632
  if response_body is None:
633
  self.send_response(404)
634
+ self.end_headers()
635
  rp = 'Error: HTTP Server is running, but this endpoint does not exist. Please check the URL.'
636
  self.wfile.write(rp.encode())
637
  else:
638
  self.send_response(200)
639
+ self.send_header('Content-Length', str(len(response_body)))
640
+ self.end_headers(force_json=force_json)
641
  self.wfile.write(response_body)
642
  return
643
 
644
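As a usage sketch for the GET routes wired up above, assuming the default port 5001 set earlier in this file:

    import json, urllib.request

    base = "http://localhost:5001"
    with urllib.request.urlopen(base + "/v1/models") as resp:
        # force_json=True above means this endpoint replies as application/json
        models = json.load(resp)
    print(models["data"][0]["id"])  # the loaded model's friendly name
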
  def do_POST(self):
645
  global modelbusy, requestsinqueue, currentusergenkey, totalgens
646
+ content_length = int(self.headers['Content-Length'])
647
  body = self.rfile.read(content_length)
648
  self.path = self.path.rstrip('/')
649
+ force_json = False
650
  if self.path.endswith(('/api/extra/tokencount')):
651
  try:
652
  genparams = json.loads(body)
653
  countprompt = genparams.get('prompt', "")
654
  count = handle.token_count(countprompt.encode("UTF-8"))
655
  self.send_response(200)
656
+ self.end_headers()
657
  self.wfile.write(json.dumps({"value": count}).encode())
658
 
659
  except ValueError as e:
660
  utfprint("Count Tokens - Body Error: " + str(e))
661
  self.send_response(400)
662
+ self.end_headers()
663
  self.wfile.write(json.dumps({"value": -1}).encode())
664
  return
665
 
 
672
  multiuserkey = ""
673
  pass
674
 
675
+ if (multiuserkey!="" and multiuserkey==currentusergenkey) or requestsinqueue==0:
676
  ag = handle.abort_generate()
677
  time.sleep(0.3) #short delay before replying
678
  self.send_response(200)
679
+ self.end_headers()
680
  self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
681
  print("\nGeneration Aborted")
682
  else:
 
694
  pass
695
 
696
  if totalgens>0:
697
+ if (multiuserkey!="" and multiuserkey==currentusergenkey) or requestsinqueue==0:
698
  pendtxt = handle.get_pending_output()
699
  pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore")
700
  self.send_response(200)
701
+ self.end_headers()
702
  self.wfile.write(json.dumps({"results": [{"text": pendtxtStr}]}).encode())
703
  return
704
 
 
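Both the abort branch and the pending-output branch above gate on the same condition; as a sketch (helper name is the editor's):

    def may_touch_current_generation(genkey, currentusergenkey, requestsinqueue):
        # a caller may act on the in-flight generation if it presents the key of the
        # current generation, or if nothing else is queued behind it
        return (genkey != "" and genkey == currentusergenkey) or requestsinqueue == 0
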
708
  requestsinqueue += 1
709
  if not modelbusy.acquire(blocking=reqblocking):
710
  self.send_response(503)
711
+ self.end_headers()
712
  self.wfile.write(json.dumps({"detail": {
713
  "msg": "Server is busy; please try again later.",
714
  "type": "service_unavailable",
 
734
 
735
  if self.path.endswith('/v1/completions'):
736
  api_format = 3
737
+ force_json = True
738
 
739
  if self.path.endswith('/v1/chat/completions'):
740
  api_format = 4
741
+ force_json = True
742
 
743
  if api_format > 0:
744
  genparams = None
 
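Summarising the OpenAI-compatible routes above as a sketch (only the two format codes visible in this hunk are listed):

    OPENAI_API_FORMATS = {
        "/v1/completions": 3,       # OpenAI text completions, force_json=True
        "/v1/chat/completions": 4,  # OpenAI chat completions, force_json=True
    }
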
764
  # Headers are already sent when streaming
765
  if not sse_stream_flag:
766
  self.send_response(200)
767
+ self.end_headers(force_json=force_json)
768
+ self.wfile.write(json.dumps(gen).encode())
769
  except:
770
  print("Generate: The response could not be sent, maybe connection was terminated?")
771
  return
 
773
  modelbusy.release()
774
 
775
  self.send_response(404)
776
+ self.end_headers()
777
 
778
 
779
  def do_OPTIONS(self):
780
  self.send_response(200)
781
+ self.end_headers()
782
 
783
  def do_HEAD(self):
784
  self.send_response(200)
785
+ self.end_headers()
786
+
787
+ def end_headers(self, force_json=False, sse_stream_flag=False):
788
+ self.send_header('Access-Control-Allow-Origin', '*')
789
+ self.send_header('Access-Control-Allow-Methods', '*')
790
+ self.send_header('Access-Control-Allow-Headers', '*')
791
+ if ("/api" in self.path and self.path!="/api") or force_json:
792
+ if sse_stream_flag:
793
+ self.send_header('Content-type', 'text/event-stream')
794
+ self.send_header('Content-type', 'application/json')
795
+ else:
796
+ self.send_header('Content-type', 'text/html')
797
  return super(ServerRequestHandler, self).end_headers()
798
 
799
 
 
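The reworked end_headers above folds the CORS headers and the content-type choice into one place. A standalone sketch of just the content-type decision (function name is the editor's):

    def pick_content_type(path, force_json=False, sse_stream_flag=False):
        # API paths (anything under /api except the /api docs page itself) and the
        # OpenAI-compatible endpoints (force_json=True) reply as JSON; everything else as HTML
        headers = []
        if ("/api" in path and path != "/api") or force_json:
            if sse_stream_flag:
                headers.append(("Content-type", "text/event-stream"))
            headers.append(("Content-type", "application/json"))
        else:
            headers.append(("Content-type", "text/html"))
        return headers

    # pick_content_type("/api/extra/tokencount") -> [("Content-type", "application/json")]
    # pick_content_type("/")                     -> [("Content-type", "text/html")]
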
1017
  mmq_var = ctk.IntVar(value=1)
1018
  blas_threads_var = ctk.StringVar()
1019
  blas_size_var = ctk.IntVar()
1020
+ version_var =ctk.StringVar(value="0")
 
1021
 
1022
  smartcontext = ctk.IntVar()
1023
  context_var = ctk.IntVar()
 
1069
  quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
1070
  mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
1071
  quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
 
 
1072
  else:
1073
  lowvram_box.grid_forget()
1074
  quick_lowvram_box.grid_forget()
1075
  mmq_box.grid_forget()
1076
  quick_mmq_box.grid_forget()
 
 
1077
 
1078
  if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
1079
  gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
 
1086
  quick_gpu_layers_label.grid_forget()
1087
  quick_gpu_layers_entry.grid_forget()
1088
 
 
1089
  # presets selector
1090
  makelabel(quick_tab, "Presets:", 1)
1091
 
 
1118
  makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2)
1119
 
1120
  # load model
1121
+ makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170,filetypes=[("GGML Model Files", "*.gguf;*.bin;*.ggml")])
1122
 
1123
  # Hardware Tab
1124
  hardware_tab = tabcontent["Hardware"]
 
1137
  gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4"], width=60, variable=gpu_choice_var, state="readonly")
1138
  CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4", "All"], width=60, variable=gpu_choice_var, state="readonly")
1139
  gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50)
 
1140
  lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0)
1141
  mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1)
1142
 
 
1185
  # Model Tab
1186
  model_tab = tabcontent["Model"]
1187
 
1188
+ makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1, filetypes=[("GGML Model Files", "*.gguf;*.bin;*.ggml")])
1189
  makefileentry(model_tab, "Lora:", "Select Lora File",lora_var, 3)
1190
  makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5)
1191
 
 
1265
  args.noavx2 = True
1266
  args.noblas = True
1267
  args.nommap = True
1268
 
1269
  args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
1270
 
 
1329
  runopts_var.set(openblas_option)
1330
  if "gpulayers" in dict and dict["gpulayers"]:
1331
  gpulayers_var.set(dict["gpulayers"])
1332
  if "blasthreads" in dict and dict["blasthreads"]:
1333
  blas_threads_var.set(str(dict["blasthreads"]))
1334
  else:
 
1447
  def run_horde_worker(args, api_key, worker_name):
1448
  import urllib.request
1449
  from datetime import datetime
1450
+ global friendlymodelname, maxhordectx, maxhordelen, exitcounter, modelbusy, session_starttime
1451
  epurl = f"http://localhost:{args.port}"
1452
  if args.host!="":
1453
  epurl = f"http://{args.host}:{args.port}"
 
1456
  print(f"{datetime.now().strftime('[%H:%M:%S]')} " + txt)
1457
 
1458
  def submit_completed_generation(url, jobid, sessionstart, submit_dict):
1459
+ global exitcounter, session_kudos_earned, session_jobs
1460
  reply = make_url_request(url, submit_dict)
1461
  if not reply:
1462
  exitcounter += 1
 
1463
  print_with_time(f"Error, Job submit failed.")
1464
  else:
1465
  reward = reply["reward"]
 
1473
  elapsedtimestr = f"{hrs:03d}h:{mins:02d}m:{secs:02d}s"
1474
  earnrate = session_kudos_earned/(elapsedtime.seconds/3600)
1475
  print_with_time(f'Submitted {jobid} and earned {reward:.0f} kudos\n[Total:{session_kudos_earned:.0f} kudos, Time:{elapsedtimestr}, Jobs:{session_jobs}, EarnRate:{earnrate:.0f} kudos/hr]')
1476
 
1477
  def make_url_request(url, data, method='POST'):
1478
  try:
 
1481
  if method=='POST':
1482
  json_payload = json.dumps(data).encode('utf-8')
1483
  request = urllib.request.Request(url, data=json_payload, headers=headers, method=method)
1484
+ request.add_header('Content-Type', 'application/json')
1485
  else:
1486
  request = urllib.request.Request(url, headers=headers, method=method)
1487
  response_data = ""
 
1508
  print(f"===\nEmbedded Horde Worker '{worker_name}' Starting...\n(To use your own KAI Bridge/Scribe worker instead, don't set your API key)")
1509
  BRIDGE_AGENT = f"KoboldCppEmbedWorker:2:https://github.com/LostRuins/koboldcpp"
1510
  cluster = "https://horde.koboldai.net"
1511
+ while exitcounter < 10:
1512
  time.sleep(3)
1513
  readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
1514
  if readygo:
1515
  print_with_time(f"Embedded Horde Worker '{worker_name}' is started.")
1516
  break
1517
 
1518
+ while exitcounter < 10:
1519
  currentjob_attempts = 0
1520
  current_generation = None
1521
1522
  #first, make sure we are not generating
1523
  if modelbusy.locked():
1524
  time.sleep(0.2)
 
1537
  pop = make_url_request(f'{cluster}/api/v2/generate/text/pop',gen_dict)
1538
  if not pop:
1539
  exitcounter += 1
 
1540
  print_with_time(f"Failed to fetch job from {cluster}. Waiting 5 seconds...")
1541
  time.sleep(5)
1542
  continue
 
1555
  print_with_time(f"Job received from {cluster} for {current_payload.get('max_length',80)} tokens and {current_payload.get('max_context_length',1024)} max context. Starting generation...")
1556
 
1557
  #do gen
1558
+ while exitcounter < 10:
1559
  if not modelbusy.locked():
1560
  current_generation = make_url_request(f'{epurl}/api/v1/generate', current_payload)
1561
  if current_generation:
 
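In this build the horde worker's retry handling is simplified: each failed request bumps exitcounter, and the three loops above all stop once it reaches 10. As a sketch of that bail-out check (constant name is the editor's):

    MAX_WORKER_ERRORS = 10  # the loops above compare exitcounter < 10 inline

    def worker_should_continue(exitcounter):
        return exitcounter < MAX_WORKER_ERRORS
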
1880
  parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", action='store_true')
1881
  parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
1882
 
1883
  main(parser.parse_args(),start_server=True)
llama.cpp CHANGED
@@ -189,7 +189,6 @@ enum llm_arch {
189
  LLM_ARCH_STARCODER,
190
  LLM_ARCH_PERSIMMON,
191
  LLM_ARCH_REFACT,
192
- LLM_ARCH_BLOOM,
193
  LLM_ARCH_UNKNOWN,
194
  };
195
 
@@ -203,8 +202,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
203
  { LLM_ARCH_BAICHUAN, "baichuan" },
204
  { LLM_ARCH_STARCODER, "starcoder" },
205
  { LLM_ARCH_PERSIMMON, "persimmon" },
206
- { LLM_ARCH_REFACT, "refact" },
207
- { LLM_ARCH_BLOOM, "bloom" },
208
  };
209
 
210
  enum llm_kv {
@@ -307,7 +305,6 @@ struct LLM_KV {
307
 
308
  enum llm_tensor {
309
  LLM_TENSOR_TOKEN_EMBD,
310
- LLM_TENSOR_TOKEN_EMBD_NORM,
311
  LLM_TENSOR_POS_EMBD,
312
  LLM_TENSOR_OUTPUT,
313
  LLM_TENSOR_OUTPUT_NORM,
@@ -428,14 +425,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
428
  LLM_ARCH_MPT,
429
  {
430
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
431
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
432
- { LLM_TENSOR_OUTPUT, "output" },
433
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
434
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
435
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
436
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
437
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
438
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
439
  },
440
  },
441
  {
@@ -470,21 +459,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
470
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
471
  },
472
  },
473
- {
474
- LLM_ARCH_BLOOM,
475
- {
476
- { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
477
- { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
478
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
479
- { LLM_TENSOR_OUTPUT, "output" },
480
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
481
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
482
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
483
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
484
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
485
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
486
- },
487
- },
488
  {
489
  LLM_ARCH_UNKNOWN,
490
  {
@@ -1042,9 +1016,6 @@ struct llama_hparams {
1042
  float rope_freq_base_train;
1043
  float rope_freq_scale_train;
1044
 
1045
- float f_clamp_kqv;
1046
- float f_max_alibi_bias;
1047
-
1048
  bool operator!=(const llama_hparams & other) const {
1049
  if (this->vocab_only != other.vocab_only) return true;
1050
  if (this->n_vocab != other.n_vocab) return true;
@@ -1230,8 +1201,6 @@ struct llama_model {
1230
 
1231
  struct ggml_tensor * tok_embeddings;
1232
  struct ggml_tensor * pos_embeddings;
1233
- struct ggml_tensor * tok_norm;
1234
- struct ggml_tensor * tok_norm_b;
1235
 
1236
  struct ggml_tensor * output_norm;
1237
  struct ggml_tensor * output_norm_b;
@@ -1361,11 +1330,7 @@ static bool llama_kv_cache_init(
1361
  cache.cells.clear();
1362
  cache.cells.resize(n_ctx);
1363
 
1364
- // TODO: this should be:
1365
- // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
1366
- // change it and test that it works
1367
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
1368
- memset(cache.buf.data, 0, cache.buf.size);
1369
 
1370
  struct ggml_init_params params;
1371
  params.mem_size = cache.buf.size;
@@ -1771,7 +1736,7 @@ struct llama_model_loader {
1771
  }
1772
  }
1773
 
1774
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
1775
  if (backend != GGML_BACKEND_CPU) {
1776
  ggml_set_no_alloc(ctx, true);
1777
  }
@@ -1789,7 +1754,7 @@ struct llama_model_loader {
1789
  return tensor;
1790
  }
1791
 
1792
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
1793
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1794
 
1795
  if (cur == NULL) {
@@ -2082,13 +2047,13 @@ static void llm_load_hparams(
2082
  }
2083
  } break;
2084
  case LLM_ARCH_PERSIMMON:
2085
- {
2086
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2087
- switch (hparams.n_layer) {
2088
- case 36: model.type = e_model::MODEL_8B; break;
2089
- default: model.type = e_model::MODEL_UNKNOWN;
2090
- }
2091
- } break;
2092
  case LLM_ARCH_REFACT:
2093
  {
2094
  GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -2097,33 +2062,6 @@ static void llm_load_hparams(
2097
  default: model.type = e_model::MODEL_UNKNOWN;
2098
  }
2099
  } break;
2100
- case LLM_ARCH_BLOOM:
2101
- {
2102
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2103
-
2104
- switch (hparams.n_layer) {
2105
- case 24: model.type = e_model::MODEL_1B; break;
2106
- case 30:
2107
- switch (hparams.n_embd) {
2108
- case 2560: model.type = e_model::MODEL_3B; break;
2109
- case 4096: model.type = e_model::MODEL_7B; break;
2110
- } break;
2111
- }
2112
- } break;
2113
- case LLM_ARCH_MPT:
2114
- {
2115
- hparams.f_clamp_kqv = 0.0f;
2116
-
2117
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2118
- GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
2119
- GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
2120
-
2121
- switch (hparams.n_layer) {
2122
- case 32: model.type = e_model::MODEL_7B; break;
2123
- case 48: model.type = e_model::MODEL_30B; break;
2124
- default: model.type = e_model::MODEL_UNKNOWN;
2125
- }
2126
- } break;
2127
  default: (void)0;
2128
  }
2129
 
@@ -2268,8 +2206,6 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
2268
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2269
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2270
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
2271
- LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
2272
- LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
2273
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2274
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2275
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2369,8 +2305,8 @@ static void llm_load_tensors(
2369
 
2370
  // output
2371
  {
2372
- ggml_backend_type backend_norm;
2373
- ggml_backend_type backend_output;
2374
 
2375
  if (n_gpu_layers > int(n_layer)) {
2376
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2405,8 +2341,8 @@ static void llm_load_tensors(
2405
  model.layers.resize(n_layer);
2406
 
2407
  for (uint32_t i = 0; i < n_layer; ++i) {
2408
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2409
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2410
 
2411
  auto & layer = model.layers[i];
2412
 
@@ -2435,8 +2371,8 @@ static void llm_load_tensors(
2435
  {
2436
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2437
  {
2438
- ggml_backend_type backend_norm;
2439
- ggml_backend_type backend_output;
2440
 
2441
  if (n_gpu_layers > int(n_layer)) {
2442
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2471,8 +2407,8 @@ static void llm_load_tensors(
2471
  model.layers.resize(n_layer);
2472
 
2473
  for (uint32_t i = 0; i < n_layer; ++i) {
2474
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2475
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2476
 
2477
  auto & layer = model.layers[i];
2478
 
@@ -2505,8 +2441,8 @@ static void llm_load_tensors(
2505
 
2506
  // output
2507
  {
2508
- ggml_backend_type backend_norm;
2509
- ggml_backend_type backend_output;
2510
 
2511
  if (n_gpu_layers > int(n_layer)) {
2512
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2543,8 +2479,8 @@ static void llm_load_tensors(
2543
  model.layers.resize(n_layer);
2544
 
2545
  for (uint32_t i = 0; i < n_layer; ++i) {
2546
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2547
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2548
 
2549
  auto & layer = model.layers[i];
2550
 
@@ -2582,8 +2518,8 @@ static void llm_load_tensors(
2582
 
2583
  // output
2584
  {
2585
- ggml_backend_type backend_norm;
2586
- ggml_backend_type backend_output;
2587
 
2588
  if (n_gpu_layers > int(n_layer)) {
2589
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2620,8 +2556,8 @@ static void llm_load_tensors(
2620
  model.layers.resize(n_layer);
2621
 
2622
  for (uint32_t i = 0; i < n_layer; ++i) {
2623
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2624
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2625
 
2626
  auto & layer = model.layers[i];
2627
 
@@ -2659,8 +2595,8 @@ static void llm_load_tensors(
2659
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2660
 
2661
  {
2662
- ggml_backend_type backend_norm;
2663
- ggml_backend_type backend_output;
2664
 
2665
  if (n_gpu_layers > int(n_layer)) {
2666
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2694,8 +2630,8 @@ static void llm_load_tensors(
2694
  const int i_gpu_start = n_layer - n_gpu_layers;
2695
  model.layers.resize(n_layer);
2696
  for (uint32_t i = 0; i < n_layer; ++i) {
2697
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2698
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2699
  auto & layer = model.layers[i];
2700
  layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2701
  layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2715,155 +2651,6 @@ static void llm_load_tensors(
2715
  layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
2716
  }
2717
  } break;
2718
- case LLM_ARCH_BLOOM:
2719
- {
2720
- // TODO: CPU-only for now
2721
-
2722
- model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2723
- model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
2724
- model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
2725
-
2726
- // output
2727
- {
2728
- ggml_backend_type backend_norm;
2729
- ggml_backend_type backend_output;
2730
-
2731
- if (n_gpu_layers > int(n_layer)) {
2732
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2733
- // on Windows however this is detrimental unless everything is on the GPU
2734
- #ifndef _WIN32
2735
- backend_norm = LLAMA_BACKEND_OFFLOAD;
2736
- #else
2737
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2738
- #endif // _WIN32
2739
-
2740
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2741
- } else {
2742
- backend_norm = GGML_BACKEND_CPU;
2743
- backend_output = GGML_BACKEND_CPU;
2744
- }
2745
-
2746
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2747
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2748
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2749
-
2750
- if (backend_norm == GGML_BACKEND_GPU) {
2751
- vram_weights += ggml_nbytes(model.output_norm);
2752
- vram_weights += ggml_nbytes(model.output_norm_b);
2753
- }
2754
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2755
- vram_weights += ggml_nbytes(model.output);
2756
- }
2757
- }
2758
-
2759
- const uint32_t n_ff = hparams.n_ff;
2760
-
2761
- const int i_gpu_start = n_layer - n_gpu_layers;
2762
-
2763
- model.layers.resize(n_layer);
2764
-
2765
- for (uint32_t i = 0; i < n_layer; ++i) {
2766
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2767
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2768
-
2769
- auto & layer = model.layers[i];
2770
-
2771
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2772
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2773
-
2774
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2775
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2776
-
2777
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2778
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2779
-
2780
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2781
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2782
-
2783
- layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2784
- layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2785
-
2786
- layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2787
- layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2788
-
2789
- if (backend == GGML_BACKEND_GPU) {
2790
- vram_weights +=
2791
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
2792
- ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
2793
- ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
2794
- ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
2795
- ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
2796
- ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
2797
- }
2798
- }
2799
- } break;
2800
- case LLM_ARCH_MPT:
2801
- {
2802
- model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2803
-
2804
- // output
2805
- {
2806
- ggml_backend_type backend_norm;
2807
- ggml_backend_type backend_output;
2808
-
2809
- if (n_gpu_layers > int(n_layer)) {
2810
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2811
- // on Windows however this is detrimental unless everything is on the GPU
2812
- #ifndef _WIN32
2813
- backend_norm = LLAMA_BACKEND_OFFLOAD;
2814
- #else
2815
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2816
- #endif // _WIN32
2817
-
2818
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2819
- } else {
2820
- backend_norm = GGML_BACKEND_CPU;
2821
- backend_output = GGML_BACKEND_CPU;
2822
- }
2823
-
2824
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2825
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2826
-
2827
- if (backend_norm == GGML_BACKEND_GPU) {
2828
- vram_weights += ggml_nbytes(model.output_norm);
2829
- }
2830
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2831
- vram_weights += ggml_nbytes(model.output);
2832
- }
2833
- }
2834
-
2835
- const uint32_t n_ff = hparams.n_ff;
2836
-
2837
- const int i_gpu_start = n_layer - n_gpu_layers;
2838
-
2839
- model.layers.resize(n_layer);
2840
-
2841
- for (uint32_t i = 0; i < n_layer; ++i) {
2842
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2843
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2844
-
2845
- auto & layer = model.layers[i];
2846
-
2847
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2848
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
2849
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2850
-
2851
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2852
-
2853
- layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2854
- layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2855
-
2856
- if (backend == GGML_BACKEND_GPU) {
2857
- vram_weights +=
2858
- ggml_nbytes(layer.attn_norm) +
2859
- ggml_nbytes(layer.wqkv) +
2860
- ggml_nbytes(layer.wo) +
2861
- ggml_nbytes(layer.ffn_norm) +
2862
- ggml_nbytes(layer.w2) +
2863
- ggml_nbytes(layer.w3);
2864
- }
2865
- }
2866
- } break;
2867
  default:
2868
  throw std::runtime_error("unknown architecture");
2869
  }
@@ -4720,6 +4507,7 @@ static struct ggml_cgraph * llm_build_starcoder(
4720
  return gf;
4721
  }
4722
 
 
4723
  static struct ggml_cgraph * llm_build_persimmon(
4724
  llama_context & lctx,
4725
  const llama_batch & batch) {
@@ -5117,571 +4905,12 @@ static struct ggml_cgraph * llm_build_persimmon(
5117
  return gf;
5118
  }
5119
 
5120
- static struct ggml_cgraph * llm_build_bloom(
5121
  llama_context & lctx,
5122
  const llama_batch & batch) {
5123
- const auto & model = lctx.model;
5124
- const auto & hparams = model.hparams;
5125
- const auto & cparams = lctx.cparams;
5126
-
5127
- const auto & kv_self = lctx.kv_self;
5128
-
5129
- GGML_ASSERT(!!kv_self.ctx);
5130
-
5131
- const int64_t n_embd = hparams.n_embd;
5132
- const int64_t n_layer = hparams.n_layer;
5133
- const int64_t n_ctx = cparams.n_ctx;
5134
- const int64_t n_head = hparams.n_head;
5135
- const int64_t n_head_kv = hparams.n_head_kv;
5136
- const int64_t n_embd_head = hparams.n_embd_head();
5137
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
5138
-
5139
- GGML_ASSERT(n_embd_head == hparams.n_rot);
5140
-
5141
- const float norm_eps = hparams.f_norm_eps;
5142
-
5143
- const int32_t n_tokens = batch.n_tokens;
5144
- const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5145
- const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5146
-
5147
- auto & buf_compute = lctx.buf_compute;
5148
-
5149
- struct ggml_init_params params = {
5150
- /*.mem_size =*/ buf_compute.size,
5151
- /*.mem_buffer =*/ buf_compute.data,
5152
- /*.no_alloc =*/ false,
5153
- };
5154
-
5155
- params.no_alloc = true;
5156
-
5157
- struct ggml_context * ctx0 = ggml_init(params);
5158
-
5159
- ggml_cgraph * gf = ggml_new_graph(ctx0);
5160
-
5161
- struct ggml_tensor * cur;
5162
- struct ggml_tensor * token;
5163
- struct ggml_tensor * inpL;
5164
-
5165
- if (batch.token) {
5166
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5167
-
5168
- ggml_allocr_alloc(lctx.alloc, inp_tokens);
5169
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5170
- memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5171
- }
5172
- ggml_set_name(inp_tokens, "inp_tokens");
5173
-
5174
- token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5175
- } else {
5176
- #ifdef GGML_USE_MPI
5177
- GGML_ASSERT(false && "not implemented");
5178
- #endif
5179
-
5180
- token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5181
-
5182
- ggml_allocr_alloc(lctx.alloc, token);
5183
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5184
- memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
5185
- }
5186
- }
5187
-
5188
- // KQ_scale
5189
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5190
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5191
- ggml_allocr_alloc(lctx.alloc, KQ_scale);
5192
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5193
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5194
- }
5195
 
5196
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5197
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5198
- ggml_set_name(KQ_mask, "KQ_mask");
5199
- ggml_allocr_alloc(lctx.alloc, KQ_mask);
5200
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5201
- float * data = (float *) KQ_mask->data;
5202
- memset(data, 0, ggml_nbytes(KQ_mask));
5203
-
5204
- for (int h = 0; h < 1; ++h) {
5205
- for (int j = 0; j < n_tokens; ++j) {
5206
- const llama_pos pos = batch.pos[j];
5207
- const llama_seq_id seq_id = batch.seq_id[j];
5208
-
5209
- for (int i = 0; i < n_kv; ++i) {
5210
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5211
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5212
- }
5213
- }
5214
- }
5215
- }
5216
- }
5217
-
5218
- // norm
5219
- {
5220
- inpL = ggml_norm(ctx0, token, norm_eps);
5221
- inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
5222
- }
5223
-
5224
- ggml_set_name(inpL, "inpL");
5225
-
5226
- for (int il = 0; il < n_layer; ++il) {
5227
- {
5228
- // Norm
5229
- cur = ggml_norm(ctx0, inpL, norm_eps);
5230
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5231
- }
5232
-
5233
- {
5234
- // Self Attention
5235
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5236
-
5237
- struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
5238
- struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
5239
- struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5240
-
5241
- struct ggml_tensor * Qcur = tmpq;
5242
- struct ggml_tensor * Kcur = tmpk;
5243
-
5244
- // store key and value to memory
5245
- {
5246
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5247
- ggml_set_name(Vcur, "Vcur");
5248
-
5249
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5250
- ggml_set_name(k, "k");
5251
-
5252
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5253
- ( n_ctx)*ggml_element_size(kv_self.v),
5254
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5255
-
5256
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5257
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5258
- }
5259
-
5260
- struct ggml_tensor * Q =
5261
- ggml_permute(ctx0,
5262
- ggml_cpy(ctx0,
5263
- Qcur,
5264
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
5265
- 0, 2, 1, 3);
5266
- ggml_set_name(Q, "Q");
5267
-
5268
- struct ggml_tensor * K =
5269
- ggml_view_3d(ctx0, kv_self.k,
5270
- n_embd_head, n_kv, n_head_kv,
5271
- ggml_element_size(kv_self.k)*n_embd_gqa,
5272
- ggml_element_size(kv_self.k)*n_embd_head,
5273
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5274
- ggml_set_name(K, "K");
5275
-
5276
- // K * Q
5277
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5278
- ggml_set_name(KQ, "KQ");
5279
-
5280
- // KQ_scaled = KQ / sqrt(n_embd_head)
5281
- // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
5282
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5283
- ggml_set_name(KQ_scaled, "KQ_scaled");
5284
-
5285
- struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
5286
- ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5287
-
5288
- // KQ_masked = mask_past(KQ_scaled)
5289
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5290
- ggml_set_name(KQ_masked, "KQ_masked");
5291
-
5292
- // KQ = soft_max(KQ_masked)
5293
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5294
- ggml_set_name(KQ_soft_max, "KQ_soft_max");
5295
-
5296
- // split cached V into n_head heads
5297
- struct ggml_tensor * V =
5298
- ggml_view_3d(ctx0, kv_self.v,
5299
- n_kv, n_embd_head, n_head_kv,
5300
- ggml_element_size(kv_self.v)*n_ctx,
5301
- ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5302
- ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5303
- ggml_set_name(V, "V");
5304
-
5305
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5306
- ggml_set_name(KQV, "KQV");
5307
-
5308
- // KQV_merged = KQV.permute(0, 2, 1, 3)
5309
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5310
- ggml_set_name(KQV_merged, "KQV_merged");
5311
-
5312
- // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
5313
- cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5314
- ggml_set_name(cur, "KQV_merged_contiguous");
5315
- }
5316
-
5317
- // Projection
5318
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5319
-
5320
- // Add the input
5321
- cur = ggml_add(ctx0, cur, inpL);
5322
-
5323
- struct ggml_tensor * inpFF = cur;
5324
-
5325
- // FF
5326
- {
5327
- // Norm
5328
- {
5329
- cur = ggml_norm(ctx0, inpFF, norm_eps);
5330
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5331
- }
5332
-
5333
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5334
-
5335
- // GELU activation
5336
- cur = ggml_gelu(ctx0, cur);
5337
-
5338
- // Projection
5339
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5340
- }
5341
-
5342
- inpL = ggml_add(ctx0, cur, inpFF);
5343
- }
5344
-
5345
- // Output Norm
5346
- {
5347
- cur = ggml_norm(ctx0, inpL, norm_eps);
5348
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5349
- }
5350
- ggml_set_name(cur, "result_norm");
5351
-
5352
- cur = ggml_mul_mat(ctx0, model.output, cur);
5353
- ggml_set_name(cur, "result_output");
5354
-
5355
- ggml_build_forward_expand(gf, cur);
5356
-
5357
- ggml_free(ctx0);
5358
-
5359
- return gf;
5360
- }
5361
-
5362
- static struct ggml_cgraph * llm_build_mpt(
5363
- llama_context & lctx,
5364
- const llama_batch & batch) {
5365
- const auto & model = lctx.model;
5366
- const auto & hparams = model.hparams;
5367
- const auto & cparams = lctx.cparams;
5368
-
5369
- const auto & kv_self = lctx.kv_self;
5370
-
5371
- GGML_ASSERT(!!kv_self.ctx);
5372
-
5373
- const int64_t n_embd = hparams.n_embd;
5374
- const int64_t n_layer = hparams.n_layer;
5375
- const int64_t n_ctx = cparams.n_ctx;
5376
- const int64_t n_head = hparams.n_head;
5377
- const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
5378
- const int64_t n_embd_head = hparams.n_embd_head();
5379
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
5380
-
5381
- const float norm_eps = hparams.f_norm_eps;
5382
- const float clamp_kqv = hparams.f_clamp_kqv;
5383
- const float max_alibi_bias = hparams.f_max_alibi_bias;
5384
-
5385
- const int n_gpu_layers = model.n_gpu_layers;
5386
-
5387
- const int32_t n_tokens = batch.n_tokens;
5388
- const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5389
- const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5390
-
5391
- auto & buf_compute = lctx.buf_compute;
5392
-
5393
- struct ggml_init_params params = {
5394
- /*.mem_size =*/ buf_compute.size,
5395
- /*.mem_buffer =*/ buf_compute.data,
5396
- /*.no_alloc =*/ false,
5397
- };
5398
-
5399
- params.no_alloc = true;
5400
-
5401
- struct ggml_context * ctx0 = ggml_init(params);
5402
-
5403
- ggml_cgraph * gf = ggml_new_graph(ctx0);
5404
-
5405
- struct ggml_tensor * cur;
5406
- struct ggml_tensor * inpL;
5407
-
5408
- //int warmup = 0;
5409
- if (batch.token) {
5410
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5411
-
5412
- ggml_allocr_alloc(lctx.alloc, inp_tokens);
5413
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5414
- memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5415
- //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
5416
- }
5417
-
5418
- ggml_set_name(inp_tokens, "inp_tokens");
5419
-
5420
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5421
- } else {
5422
- #ifdef GGML_USE_MPI
5423
- GGML_ASSERT(false && "not implemented");
5424
- #endif
5425
-
5426
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5427
-
5428
- ggml_allocr_alloc(lctx.alloc, inpL);
5429
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5430
- memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
5431
- }
5432
- }
5433
-
5434
- const int i_gpu_start = n_layer - n_gpu_layers;
5435
- (void) i_gpu_start;
5436
-
5437
- // offload functions set the tensor output backend to GPU
5438
- // tensors are GPU-accelerated if any input or the output has been offloaded
5439
- offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
5440
- offload_func_t offload_func_kq = llama_nop;
5441
- offload_func_t offload_func_v = llama_nop;
5442
-
5443
- #ifdef GGML_USE_CUBLAS
5444
- if (n_gpu_layers > n_layer) {
5445
- offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
5446
- }
5447
- if (n_gpu_layers > n_layer + 1) {
5448
- offload_func_v = ggml_cuda_assign_buffers_no_alloc;
5449
- }
5450
- if (n_gpu_layers > n_layer + 2) {
5451
- offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
5452
- }
5453
- #endif // GGML_USE_CUBLAS
5454
-
5455
- // KQ_scale
5456
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5457
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5458
- ggml_allocr_alloc(lctx.alloc, KQ_scale);
5459
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5460
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5461
- }
5462
-
5463
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5464
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5465
- offload_func_kq(KQ_mask);
5466
- ggml_set_name(KQ_mask, "KQ_mask");
5467
- ggml_allocr_alloc(lctx.alloc, KQ_mask);
5468
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5469
- float * data = (float *) KQ_mask->data;
5470
- memset(data, 0, ggml_nbytes(KQ_mask));
5471
-
5472
- for (int h = 0; h < 1; ++h) {
5473
- for (int j = 0; j < n_tokens; ++j) {
5474
- const llama_pos pos = batch.pos[j];
5475
- const llama_seq_id seq_id = batch.seq_id[j];
5476
-
5477
- for (int i = 0; i < n_kv; ++i) {
5478
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5479
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5480
- }
5481
- }
5482
- }
5483
- }
5484
- }
5485
-
5486
- for (int il = 0; il < n_layer; ++il) {
5487
- struct ggml_tensor * attn_norm;
5488
-
5489
- offload_func_t offload_func = llama_nop;
5490
-
5491
- #ifdef GGML_USE_CUBLAS
5492
- if (il >= i_gpu_start) {
5493
- offload_func = ggml_cuda_assign_buffers_no_alloc;
5494
- }
5495
- #endif // GGML_USE_CUBLAS
5496
-
5497
- // self-attention
5498
- // TODO: refactor into common function (shared with LLaMA)
5499
- {
5500
- attn_norm = ggml_norm(ctx0, inpL, norm_eps);
5501
- offload_func(attn_norm);
5502
-
5503
- attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
5504
- offload_func(attn_norm);
5505
-
5506
- if (1) {
5507
- cur = attn_norm;
5508
- }
5509
-
5510
- // compute QKV
5511
-
5512
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5513
- offload_func_kq(cur);
5514
-
5515
- if (clamp_kqv > 0.0f) {
5516
- cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
5517
- offload_func_kq(cur);
5518
- }
5519
-
5520
- const size_t wsize = ggml_type_size(cur->type);
5521
-
5522
- struct ggml_tensor * Qcur = ggml_view_3d(
5523
- ctx0, cur, n_embd_head, n_head, n_tokens,
5524
- wsize * n_embd_head,
5525
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
5526
- 0);
5527
- offload_func_kq(Qcur);
5528
-
5529
- struct ggml_tensor * Kcur = ggml_view_3d(
5530
- ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5531
- wsize * n_embd_head,
5532
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
5533
- wsize * n_embd_head * n_head);
5534
- offload_func_kq(Kcur);
5535
-
5536
- struct ggml_tensor * tmpv = ggml_view_3d(
5537
- ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5538
- wsize * n_embd_head,
5539
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
5540
- wsize * n_embd_head * (n_head + n_head_kv));
5541
- offload_func_kq(Kcur);
5542
-
5543
- ggml_set_name(Qcur, "Qcur");
5544
- ggml_set_name(Kcur, "Kcur");
5545
-
5546
- {
5547
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5548
- offload_func_v(Vcur);
5549
- offload_func_v(Vcur->src[0]->src[0]);
5550
- ggml_set_name(Vcur, "Vcur");
5551
-
5552
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5553
- offload_func_kq(k);
5554
- ggml_set_name(k, "k");
5555
-
5556
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5557
- ( n_ctx)*ggml_element_size(kv_self.v),
5558
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5559
- offload_func_v(v);
5560
-
5561
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5562
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5563
- }
5564
-
5565
- struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
5566
- offload_func_kq(Q);
5567
- ggml_set_name(Q, "Q");
5568
-
5569
- struct ggml_tensor * K =
5570
- ggml_view_3d(ctx0, kv_self.k,
5571
- n_embd_head, n_kv, n_head_kv,
5572
- ggml_element_size(kv_self.k)*n_embd_gqa,
5573
- ggml_element_size(kv_self.k)*n_embd_head,
5574
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5575
- offload_func_kq(K);
5576
- ggml_set_name(K, "K");
5577
-
5578
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5579
- offload_func_kq(KQ);
5580
- ggml_set_name(KQ, "KQ");
5581
-
5582
- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5583
- offload_func_kq(KQ_scaled);
5584
- ggml_set_name(KQ_scaled, "KQ_scaled");
5585
-
5586
- // TODO: replace with ggml_add()
5587
- struct ggml_tensor * KQ_scaled_alibi =
5588
- ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
5589
- offload_func_kq(KQ_scaled_alibi);
5590
- ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5591
-
5592
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5593
- offload_func_kq(KQ_masked);
5594
- ggml_set_name(KQ_masked, "KQ_masked");
5595
-
5596
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
5597
- offload_func_v(KQ_soft_max);
5598
- ggml_set_name(KQ_soft_max, "KQ_soft_max");
5599
-
5600
- struct ggml_tensor * V =
5601
- ggml_view_3d(ctx0, kv_self.v,
5602
- n_kv, n_embd_head, n_head_kv,
5603
- ggml_element_size(kv_self.v)*n_ctx,
5604
- ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5605
- ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5606
- offload_func_v(V);
5607
- ggml_set_name(V, "V");
5608
-
5609
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5610
- offload_func_v(KQV);
5611
- ggml_set_name(KQV, "KQV");
5612
-
5613
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5614
- offload_func_v(KQV_merged);
5615
- ggml_set_name(KQV_merged, "KQV_merged");
5616
-
5617
- cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5618
- offload_func_v(cur);
5619
- ggml_set_name(cur, "KQV_merged_contiguous");
5620
-
5621
- cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5622
- offload_func(cur);
5623
- ggml_set_name(cur, "result_wo");
5624
- }
5625
-
5626
- // Add the input
5627
- cur = ggml_add(ctx0, cur, inpL);
5628
- offload_func(cur);
5629
-
5630
- struct ggml_tensor * attn_out = cur;
5631
-
5632
- // feed forward
5633
- {
5634
- // Norm
5635
- {
5636
- cur = ggml_norm(ctx0, attn_out, norm_eps);
5637
- offload_func(cur);
5638
-
5639
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
5640
- offload_func(cur);
5641
- }
5642
-
5643
- cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5644
- offload_func(cur);
5645
-
5646
- cur = ggml_gelu(ctx0, cur);
5647
- offload_func(cur);
5648
- cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5649
- offload_func(cur);
5650
- }
5651
-
5652
- cur = ggml_add(ctx0, cur, attn_out);
5653
- offload_func(cur);
5654
- // input for next layer
5655
- inpL = cur;
5656
- }
5657
-
5658
- cur = inpL;
5659
-
5660
- // norm
5661
- {
5662
- cur = ggml_norm(ctx0, cur, norm_eps);
5663
- offload_func_nr(cur);
5664
-
5665
- cur = ggml_mul(ctx0, cur, model.output_norm);
5666
- ggml_set_name(cur, "result_norm");
5667
- }
5668
-
5669
- cur = ggml_mul_mat(ctx0, model.output, cur);
5670
- ggml_set_name(cur, "result_output");
5671
-
5672
- ggml_build_forward_expand(gf, cur);
5673
-
5674
- ggml_free(ctx0);
5675
-
5676
- return gf;
5677
- }
5678
-
5679
- static struct ggml_cgraph * llama_build_graph(
5680
- llama_context & lctx,
5681
- const llama_batch & batch) {
5682
- const auto & model = lctx.model;
5683
-
5684
- struct ggml_cgraph * result = NULL;
5685
 
5686
  switch (model.arch) {
5687
  case LLM_ARCH_LLAMA:
@@ -5708,14 +4937,6 @@ static struct ggml_cgraph * llama_build_graph(
5708
  {
5709
  result = llm_build_refact(lctx, batch);
5710
  } break;
5711
- case LLM_ARCH_BLOOM:
5712
- {
5713
- result = llm_build_bloom(lctx, batch);
5714
- } break;
5715
- case LLM_ARCH_MPT:
5716
- {
5717
- result = llm_build_mpt(lctx, batch);
5718
- } break;
5719
  default:
5720
  GGML_ASSERT(false);
5721
  }
@@ -5846,8 +5067,7 @@ static int llama_decode_internal(
5846
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
5847
  model.arch == LLM_ARCH_BAICHUAN ||
5848
  model.arch == LLM_ARCH_FALCON ||
5849
- model.arch == LLM_ARCH_REFACT ||
5850
- model.arch == LLM_ARCH_MPT;
5851
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
5852
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
5853
  n_threads = 1;
@@ -6348,6 +5568,7 @@ private:
6348
  for (int i = 0; i < (int)text_utf.size(); i++) {
6349
  const std::string & utf_char = text_utf[i];
6350
  bool split_condition = false;
 
6351
  int bytes_remain = text_utf.size() - i;
6352
  // forward backward lookups
6353
  const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -6373,9 +5594,9 @@ private:
6373
  if (!split_condition && bytes_remain >= 3) {
6374
  // 're|'ve|'ll
6375
  if (utf_char == "\'" && (
6376
- (utf_char_next == "r" && utf_char_next_next == "e") ||
6377
- (utf_char_next == "v" && utf_char_next_next == "e") ||
6378
- (utf_char_next == "l" && utf_char_next_next == "l"))
6379
  ) {
6380
  split_condition = true;
6381
  }
@@ -6426,7 +5647,7 @@ private:
6426
  else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
6427
  split_condition = true;
6428
  }
6429
- else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
6430
  split_condition = true;
6431
  }
6432
  }
@@ -7945,7 +7166,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
7945
  const std::string name = ggml_get_name(meta);
7946
 
7947
  // TODO: avoid hardcoded tensor names - use the TN_* constants
7948
- if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
7949
  ++n_attention_wv;
7950
  }
7951
  else if (name.find("ffn_down.weight") != std::string::npos) {
 
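For reference, a minimal standalone sketch of the masking-plus-ALiBi pattern used by the graph builder removed above (the KQ_mask loop and the ggml_alibi call). The types, sizes, and cell metadata below are made-up stand-ins for kv_self.cells and batch.pos/seq_id, and the slope formula is the standard ALiBi one for a power-of-two head count, not necessarily bit-identical to ggml_alibi:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_head = 4, n_tokens = 3, n_kv = 5;
        const float max_alibi_bias = 8.0f;

        // hypothetical stand-ins for kv_self.cells and batch.pos/seq_id
        std::vector<int> cell_pos = {0, 1, 2, 3, 4};
        std::vector<int> cell_seq = {0, 0, 0, 1, 1};
        std::vector<int> tok_pos  = {2, 3, 4};
        std::vector<int> tok_seq  = {0, 0, 0};

        // KQ_mask: -inf for cells of another sequence or of a future position
        std::vector<float> mask(n_tokens * n_kv, 0.0f);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                if (cell_seq[i] != tok_seq[j] || cell_pos[i] > tok_pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;
                }
            }
        }

        // ALiBi: head h gets slope 2^(-max_bias*(h+1)/n_head); the bias grows linearly
        // with the key index (shift-invariant under softmax, so only differences matter)
        for (int h = 0; h < n_head; ++h) {
            const float slope = std::pow(2.0f, -max_alibi_bias * float(h + 1) / float(n_head));
            std::printf("head %d (slope %.4f), row 0 logit bias:", h, slope);
            for (int i = 0; i < n_kv; ++i) {
                std::printf(" %8.2f", mask[0*n_kv + i] + slope * float(i));
            }
            std::printf("\n");
        }
        return 0;
    }
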
189
  LLM_ARCH_STARCODER,
190
  LLM_ARCH_PERSIMMON,
191
  LLM_ARCH_REFACT,
 
192
  LLM_ARCH_UNKNOWN,
193
  };
194
 
 
202
  { LLM_ARCH_BAICHUAN, "baichuan" },
203
  { LLM_ARCH_STARCODER, "starcoder" },
204
  { LLM_ARCH_PERSIMMON, "persimmon" },
205
+ { LLM_ARCH_REFACT, "refact" },
 
206
  };
207
 
208
  enum llm_kv {
 
305
 
306
  enum llm_tensor {
307
  LLM_TENSOR_TOKEN_EMBD,
 
308
  LLM_TENSOR_POS_EMBD,
309
  LLM_TENSOR_OUTPUT,
310
  LLM_TENSOR_OUTPUT_NORM,
 
425
  LLM_ARCH_MPT,
426
  {
427
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
 
428
  },
429
  },
430
  {
 
459
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
460
  },
461
  },
 
462
  {
463
  LLM_ARCH_UNKNOWN,
464
  {
 
1016
  float rope_freq_base_train;
1017
  float rope_freq_scale_train;
1018
 
1019
  bool operator!=(const llama_hparams & other) const {
1020
  if (this->vocab_only != other.vocab_only) return true;
1021
  if (this->n_vocab != other.n_vocab) return true;
 
1201
 
1202
  struct ggml_tensor * tok_embeddings;
1203
  struct ggml_tensor * pos_embeddings;
 
 
1204
 
1205
  struct ggml_tensor * output_norm;
1206
  struct ggml_tensor * output_norm_b;
 
1330
  cache.cells.clear();
1331
  cache.cells.resize(n_ctx);
1332
 
1333
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
 
1334
 
1335
  struct ggml_init_params params;
1336
  params.mem_size = cache.buf.size;
 
1736
  }
1737
  }
1738
 
1739
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
1740
  if (backend != GGML_BACKEND_CPU) {
1741
  ggml_set_no_alloc(ctx, true);
1742
  }
 
1754
  return tensor;
1755
  }
1756
 
1757
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
1758
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1759
 
1760
  if (cur == NULL) {
 
2047
  }
2048
  } break;
2049
  case LLM_ARCH_PERSIMMON:
2050
+ {
2051
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2052
+ switch (hparams.n_layer) {
2053
+ case 36: model.type = e_model::MODEL_8B; break;
2054
+ default: model.type = e_model::MODEL_UNKNOWN;
2055
+ }
2056
+ } break;
2057
  case LLM_ARCH_REFACT:
2058
  {
2059
  GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
 
2062
  default: model.type = e_model::MODEL_UNKNOWN;
2063
  }
2064
  } break;
 
2065
  default: (void)0;
2066
  }
2067
 
 
2206
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2207
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2208
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
 
 
2209
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2210
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2211
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
 
2305
 
2306
  // output
2307
  {
2308
+ ggml_backend backend_norm;
2309
+ ggml_backend backend_output;
2310
 
2311
  if (n_gpu_layers > int(n_layer)) {
2312
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2341
  model.layers.resize(n_layer);
2342
 
2343
  for (uint32_t i = 0; i < n_layer; ++i) {
2344
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2345
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2346
 
2347
  auto & layer = model.layers[i];
2348
 
 
2371
  {
2372
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2373
  {
2374
+ ggml_backend backend_norm;
2375
+ ggml_backend backend_output;
2376
 
2377
  if (n_gpu_layers > int(n_layer)) {
2378
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2407
  model.layers.resize(n_layer);
2408
 
2409
  for (uint32_t i = 0; i < n_layer; ++i) {
2410
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2411
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2412
 
2413
  auto & layer = model.layers[i];
2414
 
 
2441
 
2442
  // output
2443
  {
2444
+ ggml_backend backend_norm;
2445
+ ggml_backend backend_output;
2446
 
2447
  if (n_gpu_layers > int(n_layer)) {
2448
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2479
  model.layers.resize(n_layer);
2480
 
2481
  for (uint32_t i = 0; i < n_layer; ++i) {
2482
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2483
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2484
 
2485
  auto & layer = model.layers[i];
2486
 
 
2518
 
2519
  // output
2520
  {
2521
+ ggml_backend backend_norm;
2522
+ ggml_backend backend_output;
2523
 
2524
  if (n_gpu_layers > int(n_layer)) {
2525
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2556
  model.layers.resize(n_layer);
2557
 
2558
  for (uint32_t i = 0; i < n_layer; ++i) {
2559
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2560
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2561
 
2562
  auto & layer = model.layers[i];
2563
 
 
2595
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2596
 
2597
  {
2598
+ ggml_backend backend_norm;
2599
+ ggml_backend backend_output;
2600
 
2601
  if (n_gpu_layers > int(n_layer)) {
2602
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2630
  const int i_gpu_start = n_layer - n_gpu_layers;
2631
  model.layers.resize(n_layer);
2632
  for (uint32_t i = 0; i < n_layer; ++i) {
2633
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2634
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2635
  auto & layer = model.layers[i];
2636
  layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2637
  layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
 
2651
  layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
2652
  }
2653
  } break;
 
2654
  default:
2655
  throw std::runtime_error("unknown architecture");
2656
  }
 
4507
  return gf;
4508
  }
4509
 
4510
+
4511
  static struct ggml_cgraph * llm_build_persimmon(
4512
  llama_context & lctx,
4513
  const llama_batch & batch) {
 
4905
  return gf;
4906
  }
4907
 
4908
+ static struct ggml_cgraph * llama_build_graph(
4909
  llama_context & lctx,
4910
  const llama_batch & batch) {
4911
+ const auto & model = lctx.model;
 
 
4912
 
4913
+ struct ggml_cgraph * result = NULL;
 
4914
 
4915
  switch (model.arch) {
4916
  case LLM_ARCH_LLAMA:
 
4937
  {
4938
  result = llm_build_refact(lctx, batch);
4939
  } break;
 
4940
  default:
4941
  GGML_ASSERT(false);
4942
  }
 
5067
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
5068
  model.arch == LLM_ARCH_BAICHUAN ||
5069
  model.arch == LLM_ARCH_FALCON ||
5070
+ model.arch == LLM_ARCH_REFACT;
 
5071
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
5072
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
5073
  n_threads = 1;
 
5568
  for (int i = 0; i < (int)text_utf.size(); i++) {
5569
  const std::string & utf_char = text_utf[i];
5570
  bool split_condition = false;
5571
+ // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
5572
  int bytes_remain = text_utf.size() - i;
5573
  // forward backward lookups
5574
  const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
 
5594
  if (!split_condition && bytes_remain >= 3) {
5595
  // 're|'ve|'ll
5596
  if (utf_char == "\'" && (
5597
+ (utf_char_next == "r" || utf_char_next_next == "e") ||
5598
+ (utf_char_next == "v" || utf_char_next_next == "e") ||
5599
+ (utf_char_next == "l" || utf_char_next_next == "l"))
5600
  ) {
5601
  split_condition = true;
5602
  }
 
5647
  else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
5648
  split_condition = true;
5649
  }
5650
+ else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
5651
  split_condition = true;
5652
  }
5653
  }
 
7166
  const std::string name = ggml_get_name(meta);
7167
 
7168
  // TODO: avoid hardcoded tensor names - use the TN_* constants
7169
+ if (name.find("attn_v.weight") != std::string::npos) {
7170
  ++n_attention_wv;
7171
  }
7172
  else if (name.find("ffn_down.weight") != std::string::npos) {
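
For reference, a minimal sketch of the apostrophe lookahead that the split conditions above implement (the GPT-2-style 's|'t|'re|'ve|'m|'ll|'d contractions). The helper and the single-character cases are illustrative assumptions; the real tokenizer also classifies letters, digits and whitespace, and the exact boolean form of the two-character check differs between this tree and upstream:

    #include <iostream>
    #include <string>
    #include <vector>

    // returns true when the apostrophe at position i starts a contraction token
    static bool is_contraction_start(const std::vector<std::string> & chars, size_t i) {
        if (chars[i] != "'") return false;
        const std::string next  = i + 1 < chars.size() ? chars[i + 1] : "";
        const std::string next2 = i + 2 < chars.size() ? chars[i + 2] : "";
        if (next == "s" || next == "t" || next == "m" || next == "d") return true;  // 's 't 'm 'd
        if ((next == "r" && next2 == "e") ||                                        // 're
            (next == "v" && next2 == "e") ||                                        // 've
            (next == "l" && next2 == "l")) return true;                             // 'll
        return false;
    }

    int main() {
        const std::vector<std::string> text = {"w", "e", "'", "r", "e", " ", "o", "k"};
        for (size_t i = 0; i < text.size(); ++i) {
            std::cout << "'" << text[i] << "'"
                      << (is_contraction_start(text, i) ? "  <- split before this apostrophe" : "")
                      << "\n";
        }
        return 0;
    }
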
otherarch/llama_v3.cpp CHANGED
@@ -63,8 +63,9 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
63
  #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__)
64
  #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
65
 
66
- #include "ggml-alloc.h"
67
  #if !defined(GGML_USE_CUBLAS)
 
68
  #define LLAMA_V3_USE_ALLOCATOR
69
  #else
70
  #define LLAMA_V3_USE_SCRATCH
@@ -724,7 +725,7 @@ struct llama_v3_model_loader {
724
  }
725
  }
726
 
727
- struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend_type backend) {
728
  auto it = tensors_map.name_to_idx.find(name);
729
  if (it == tensors_map.name_to_idx.end()) {
730
  throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));
@@ -738,7 +739,7 @@ struct llama_v3_model_loader {
738
  return get_tensor_for(lt, backend);
739
  }
740
 
741
- struct ggml_tensor * get_tensor_for(llama_v3_load_tensor & lt, ggml_backend_type backend) {
742
  struct ggml_tensor * tensor;
743
  if (backend != GGML_BACKEND_CPU) {
744
  ggml_set_no_alloc(ggml_ctx, true);
@@ -1229,8 +1230,8 @@ static void llama_v3_model_load_internal(
1229
 
1230
  // "output" tensor
1231
  {
1232
- ggml_backend_type backend_norm;
1233
- ggml_backend_type backend_output;
1234
  if (n_gpu_layers > int(n_layer)) { // NOLINT
1235
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
1236
  // on Windows however this is detrimental unless everything is on the GPU
@@ -1260,8 +1261,8 @@ static void llama_v3_model_load_internal(
1260
 
1261
  model.layers.resize(n_layer);
1262
  for (uint32_t i = 0; i < n_layer; ++i) {
1263
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; // NOLINT
1264
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD_SPLIT; // NOLINT
1265
 
1266
  auto & layer = model.layers[i];
1267
 
 
63
  #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__)
64
  #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
65
 
66
+
67
  #if !defined(GGML_USE_CUBLAS)
68
+ #include "ggml-alloc.h"
69
  #define LLAMA_V3_USE_ALLOCATOR
70
  #else
71
  #define LLAMA_V3_USE_SCRATCH
 
725
  }
726
  }
727
 
728
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
729
  auto it = tensors_map.name_to_idx.find(name);
730
  if (it == tensors_map.name_to_idx.end()) {
731
  throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));
 
739
  return get_tensor_for(lt, backend);
740
  }
741
 
742
+ struct ggml_tensor * get_tensor_for(llama_v3_load_tensor & lt, ggml_backend backend) {
743
  struct ggml_tensor * tensor;
744
  if (backend != GGML_BACKEND_CPU) {
745
  ggml_set_no_alloc(ggml_ctx, true);
 
1230
 
1231
  // "output" tensor
1232
  {
1233
+ ggml_backend backend_norm;
1234
+ ggml_backend backend_output;
1235
  if (n_gpu_layers > int(n_layer)) { // NOLINT
1236
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
1237
  // on Windows however this is detrimental unless everything is on the GPU
 
1261
 
1262
  model.layers.resize(n_layer);
1263
  for (uint32_t i = 0; i < n_layer; ++i) {
1264
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; // NOLINT
1265
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD_SPLIT; // NOLINT
1266
 
1267
  auto & layer = model.layers[i];
1268
 
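For reference, a minimal sketch of the per-layer backend assignment pattern that recurs in the loader changes above (i_gpu_start = n_layer - n_gpu_layers; earlier layers stay on the CPU, the rest are offloaded). The enum and values below are stand-ins, not the real GGML_BACKEND_* / LLAMA_V3_BACKEND_* constants:

    #include <cstdio>

    enum backend_kind { BACKEND_CPU, BACKEND_OFFLOAD };   // stand-ins for GGML_BACKEND_CPU / LLAMA_V3_BACKEND_OFFLOAD

    int main() {
        const int n_layer      = 8;
        const int n_gpu_layers = 3;                        // e.g. from --n-gpu-layers
        const int i_gpu_start  = n_layer - n_gpu_layers;   // first layer that goes to the GPU

        for (int i = 0; i < n_layer; ++i) {
            const backend_kind backend = i < i_gpu_start ? BACKEND_CPU : BACKEND_OFFLOAD;
            std::printf("layer %d -> %s\n", i, backend == BACKEND_CPU ? "CPU" : "GPU");
        }
        return 0;
    }
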
spm-headers/ggml.h CHANGED
@@ -326,7 +326,7 @@ extern "C" {
326
  GGML_TYPE_COUNT,
327
  };
328
 
329
- enum ggml_backend_type {
330
  GGML_BACKEND_CPU = 0,
331
  GGML_BACKEND_GPU = 10,
332
  GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,10 +479,8 @@ extern "C" {
479
 
480
  // n-dimensional tensor
481
  struct ggml_tensor {
482
- enum ggml_type type;
483
- enum ggml_backend_type backend;
484
-
485
- struct ggml_backend_buffer * buffer;
486
 
487
  int n_dims;
488
  int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -516,7 +514,7 @@ extern "C" {
516
 
517
  void * extra; // extra things e.g. for ggml-cuda.cu
518
 
519
- char padding[12];
520
  };
521
 
522
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -1360,7 +1358,7 @@ extern "C" {
1360
 
1361
  // alibi position embedding
1362
  // in-place, returns view(a)
1363
- GGML_API struct ggml_tensor * ggml_alibi(
1364
  struct ggml_context * ctx,
1365
  struct ggml_tensor * a,
1366
  int n_past,
@@ -1369,7 +1367,7 @@ extern "C" {
1369
 
1370
  // clamp
1371
  // in-place, returns view(a)
1372
- GGML_API struct ggml_tensor * ggml_clamp(
1373
  struct ggml_context * ctx,
1374
  struct ggml_tensor * a,
1375
  float min,
@@ -2104,7 +2102,7 @@ extern "C" {
2104
  enum ggml_type vec_dot_type;
2105
  } ggml_type_traits_t;
2106
 
2107
- GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2108
 
2109
  #ifdef __cplusplus
2110
  }
 
326
  GGML_TYPE_COUNT,
327
  };
328
 
329
+ enum ggml_backend {
330
  GGML_BACKEND_CPU = 0,
331
  GGML_BACKEND_GPU = 10,
332
  GGML_BACKEND_GPU_SPLIT = 20,
 
479
 
480
  // n-dimensional tensor
481
  struct ggml_tensor {
482
+ enum ggml_type type;
483
+ enum ggml_backend backend;
 
 
484
 
485
  int n_dims;
486
  int64_t ne[GGML_MAX_DIMS]; // number of elements
 
514
 
515
  void * extra; // extra things e.g. for ggml-cuda.cu
516
 
517
+ char padding[4];
518
  };
519
 
520
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
1358
 
1359
  // alibi position embedding
1360
  // in-place, returns view(a)
1361
+ struct ggml_tensor * ggml_alibi(
1362
  struct ggml_context * ctx,
1363
  struct ggml_tensor * a,
1364
  int n_past,
 
1367
 
1368
  // clamp
1369
  // in-place, returns view(a)
1370
+ struct ggml_tensor * ggml_clamp(
1371
  struct ggml_context * ctx,
1372
  struct ggml_tensor * a,
1373
  float min,
 
2102
  enum ggml_type vec_dot_type;
2103
  } ggml_type_traits_t;
2104
 
2105
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2106
 
2107
  #ifdef __cplusplus
2108
  }
tests/test-tokenizer-0-falcon.cpp CHANGED
@@ -36,8 +36,6 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
36
  { " Hello" , { 258, 23090, }, },
37
  { " Hello" , { 466, 23090, }, },
38
  { " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
39
- { "\n =" , { 1212, 40, }, },
40
- { "' era" , { 18, 4932, }, },
41
  };
42
 
43
  return _k_tests;
@@ -157,7 +155,7 @@ int main(int argc, char **argv) {
157
 
158
  fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
159
 
160
- const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
161
 
162
  fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
163
 
@@ -171,8 +169,10 @@ int main(int argc, char **argv) {
171
  }
172
 
173
  for (const auto & tok : res) {
174
- ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
175
  }
 
 
176
  }
177
 
178
  fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
 
36
  { " Hello" , { 258, 23090, }, },
37
  { " Hello" , { 466, 23090, }, },
38
  { " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
 
 
39
  };
40
 
41
  return _k_tests;
 
155
 
156
  fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
157
 
158
+ const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
159
 
160
  fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
161
 
 
169
  }
170
 
171
  for (const auto & tok : res) {
172
+ ofs << tok << " ";
173
  }
174
+
175
+ ofs << "\n";
176
  }
177
 
178
  fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
tests/test-tokenizer-0-falcon.py CHANGED
@@ -41,8 +41,6 @@ tests = [
41
  " Hello",
42
  " Hello",
43
  " Hello\n Hello",
44
- "\n =",
45
- "' era",
46
  ]
47
 
48
  for text in tests:
@@ -71,14 +69,15 @@ fname_tok = args.fname_tok
71
  if fname_tok:
72
  print('tokenizing file: ', fname_tok)
73
  fname_out = fname_tok + '.tok'
74
- with open(fname_tok, 'r', encoding='utf-8') as f:
75
  lines = f.readlines()
76
  s = ''.join(lines)
77
  res = tokenizer.encode(s)
78
  # write to file
79
- with open(fname_out, 'w', encoding='utf-8') as f:
80
  for x in res:
81
- f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
 
82
  print('len(res): ', len(res))
83
  print('len(lines): ', len(lines))
84
  print('results written to: ', fname_out)
 
41
  " Hello",
42
  " Hello",
43
  " Hello\n Hello",
 
 
44
  ]
45
 
46
  for text in tests:
 
69
  if fname_tok:
70
  print('tokenizing file: ', fname_tok)
71
  fname_out = fname_tok + '.tok'
72
+ with open(fname_tok, 'r') as f:
73
  lines = f.readlines()
74
  s = ''.join(lines)
75
  res = tokenizer.encode(s)
76
  # write to file
77
+ with open(fname_out, 'w') as f:
78
  for x in res:
79
+ f.write(str(x) + ' ')
80
+ f.write('\n')
81
  print('len(res): ', len(res))
82
  print('len(lines): ', len(lines))
83
  print('results written to: ', fname_out)
tests/test-tokenizer-0-llama.cpp CHANGED
@@ -174,8 +174,10 @@ int main(int argc, char **argv) {
174
  }
175
 
176
  for (const auto & tok : res) {
177
- ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
178
  }
 
 
179
  }
180
 
181
  fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
 
174
  }
175
 
176
  for (const auto & tok : res) {
177
+ ofs << tok << " ";
178
  }
179
+
180
+ ofs << "\n";
181
  }
182
 
183
  fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
tests/test-tokenizer-0-llama.py CHANGED
@@ -81,14 +81,15 @@ fname_tok = args.fname_tok
81
  if fname_tok:
82
  print('tokenizing file: ', fname_tok)
83
  fname_out = fname_tok + '.tok'
84
- with open(fname_tok, 'r', encoding='utf-8') as f:
85
  lines = f.readlines()
86
  s = ''.join(lines)
87
  res = tokenizer.encode(s, add_bos=True)
88
  # write to file
89
- with open(fname_out, 'w', encoding='utf-8') as f:
90
  for x in res:
91
- f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
 
92
  print('len(res): ', len(res))
93
  print('len(lines): ', len(lines))
94
  print('results written to: ', fname_out)
 
81
  if fname_tok:
82
  print('tokenizing file: ', fname_tok)
83
  fname_out = fname_tok + '.tok'
84
+ with open(fname_tok, 'r') as f:
85
  lines = f.readlines()
86
  s = ''.join(lines)
87
  res = tokenizer.encode(s, add_bos=True)
88
  # write to file
89
+ with open(fname_out, 'w') as f:
90
  for x in res:
91
+ f.write(str(x) + ' ')
92
+ f.write('\n')
93
  print('len(res): ', len(res))
94
  print('len(lines): ', len(lines))
95
  print('results written to: ', fname_out)
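
For reference, a minimal sketch of how the simplified .tokcpp / .tok outputs written by the tests above (plain space-separated token ids) can be compared token by token. The input file names are hypothetical; only the .tokcpp / .tok suffixes come from the tests themselves:

    #include <fstream>
    #include <iostream>

    int main() {
        std::ifstream a("wiki.test.raw.tokcpp");  // written by test-tokenizer-0-*.cpp (hypothetical base name)
        std::ifstream b("wiki.test.raw.tok");     // written by test-tokenizer-0-*.py  (hypothetical base name)
        long ia = 0, ib = 0;
        size_t idx = 0;
        while (true) {
            const bool ha = static_cast<bool>(a >> ia);
            const bool hb = static_cast<bool>(b >> ib);
            if (!ha || !hb) {
                if (ha != hb) std::cout << "length mismatch at token " << idx << "\n";
                else          std::cout << "outputs match (" << idx << " tokens)\n";
                break;
            }
            if (ia != ib) {
                std::cout << "mismatch at token " << idx << ": " << ia << " vs " << ib << "\n";
                break;
            }
            ++idx;
        }
        return 0;
    }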