Illumotion committed on
Commit 57c742e
1 parent: 6a6900d

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50):
  1. .gitignore +4 -0
  2. CMakeLists.txt +2 -0
  3. Dockerfile +11 -4
  4. Makefile +16 -14
  5. Package.swift +4 -3
  6. colab.ipynb +1 -1
  7. common/CMakeLists.txt +2 -0
  8. common/common.cpp +56 -172
  9. common/common.h +4 -39
  10. common/sampling.cpp +166 -0
  11. common/sampling.h +108 -0
  12. convert-bloom-hf-to-gguf.py +238 -0
  13. convert-mpt-hf-to-gguf.py +216 -0
  14. convert-refact-hf-to-gguf.py +263 -0
  15. examples/CMakeLists.txt +1 -0
  16. examples/batched-bench/CMakeLists.txt +5 -0
  17. examples/batched-bench/README.md +51 -0
  18. examples/batched-bench/batched-bench.cpp +251 -0
  19. examples/batched.swift/.gitignore +9 -0
  20. examples/batched.swift/Makefile +6 -0
  21. examples/batched.swift/Package.swift +22 -0
  22. examples/batched.swift/README.md +4 -0
  23. examples/batched.swift/Sources/main.swift +255 -0
  24. examples/batched/batched.cpp +1 -1
  25. examples/embd-input/embd-input-lib.cpp +10 -9
  26. examples/infill/infill.cpp +800 -0
  27. examples/main/main.cpp +17 -13
  28. examples/parallel/parallel.cpp +57 -9
  29. examples/save-load-state/save-load-state.cpp +3 -2
  30. examples/server/index.html.hpp +0 -0
  31. examples/server/public/index.html +133 -58
  32. examples/server/server.cpp +308 -145
  33. examples/speculative/speculative.cpp +13 -5
  34. ggml-alloc.c +62 -107
  35. ggml-alloc.h +11 -5
  36. ggml-backend.c +385 -0
  37. ggml-backend.h +143 -0
  38. ggml-cuda.cu +500 -78
  39. ggml-cuda.h +4 -0
  40. ggml-metal.h +18 -1
  41. ggml-metal.m +152 -9
  42. ggml-metal.metal +12 -6
  43. ggml.c +23 -45
  44. ggml.h +9 -7
  45. gguf-py/gguf/gguf.py +70 -42
  46. gpttype_adapter.cpp +1 -1
  47. koboldcpp.py +105 -53
  48. llama.cpp +844 -65
  49. otherarch/llama_v3.cpp +7 -8
  50. prompts/mnemonics.txt +93 -0
.gitignore CHANGED
@@ -45,6 +45,7 @@ models-mnt
 /server
 /simple
 /batched
+/batched-bench
 /export-lora
 /finetune
 /speculative
@@ -106,3 +107,6 @@ tests/test-tokenizer-1-bpe
 rocblas.dll
 hipblas.dll
 koboldcpp_hipblas.so
+
+# Jetbrains idea folder
+.idea/
CMakeLists.txt CHANGED
@@ -356,6 +356,8 @@ add_library(ggml OBJECT
             ggml.h
             ggml-alloc.c
             ggml-alloc.h
+            ggml-backend.c
+            ggml-backend.h
             k_quants.h
             k_quants.c
             ${GGML_SOURCES_CUDA})
Dockerfile CHANGED
@@ -2,10 +2,17 @@ FROM python
 WORKDIR /app
 COPY . .
 RUN apt update \
-    && apt install build-essential wget libopenblas-dev make -y \
-    && make LLAMA_OPENBLAS=1 \
+    && apt install build-essential wget libopenblas-dev make cmake -y \
+    && mkdir build \
     && wget https://huggingface.co/TheBloke/Pygmalion-2-7B-GGUF/resolve/main/pygmalion-2-7b.Q6_K.gguf \
-    && apt remove build-essential wget make -y \
-    && rm -fr *.bat convert-* ci docs examples otherarchs tests
+       https://github.com/mozilla/sccache/releases/download/v0.5.4/sccache-dist-v0.5.4-x86_64-unknown-linux-musl.tar.gz \
+    && tar -vxzf sccache-dist-v0.5.4-x86_64-unknown-linux-musl.tar.gz \
+    && mv sccache-dist-v0.5.4-x86_64-unknown-linux-musl/sccache /usr/bin/sccache\
+    && cd build \
+    && cmake .. -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
+    && cmake --build . \
+    && cd .. \
+    && apt remove build-essential wget make cmake -y \
+    && rm -fr *.bat convert-* ci docs examples otherarchs tests sccache-dist-v0.5.4-x86_64-unknown-linux-musl*
 
 ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-2-7b.Q6_K.gguf", "--port", "7860", "--smartcontext"]
Makefile CHANGED
@@ -372,6 +372,8 @@ endif # LLAMA_NO_K_QUANTS
 #there's no intrinsics or special gpu ops used here, so we can have a universal object
 ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 	$(CC) $(CFLAGS) -c $< -o $@
+ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
+	$(CC) $(CFLAGS) -c $< -o $@
 
 #version 2 libs
 ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
@@ -402,7 +404,7 @@ ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-ope
 	$(CC) $(CFLAGS) -c $< -o $@
 
 # intermediate objects
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 common.o: common/common.cpp common/common.h common/log.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -427,7 +429,7 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
 clean:
 	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so
 
-main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
@@ -438,11 +440,11 @@ gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
 
 
 #generated libraries
-koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
+koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
 	$(DEFAULT_BUILD)
 
 ifdef OPENBLAS_BUILD
-koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
+koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
 	$(OPENBLAS_BUILD)
 else
 koboldcpp_openblas:
@@ -450,7 +452,7 @@ koboldcpp_openblas:
 endif
 
 ifdef FAILSAFE_BUILD
-koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o grammar-parser.o $(OBJS)
+koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
 	$(FAILSAFE_BUILD)
 else
 koboldcpp_failsafe:
@@ -458,7 +460,7 @@ koboldcpp_failsafe:
 endif
 
 ifdef NOAVX2_BUILD
-koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o grammar-parser.o $(OBJS)
+koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
 	$(NOAVX2_BUILD)
 else
 koboldcpp_noavx2:
@@ -466,7 +468,7 @@ koboldcpp_noavx2:
 endif
 
 ifdef CLBLAST_BUILD
-koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
+koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
 	$(CLBLAST_BUILD)
 else
 koboldcpp_clblast:
@@ -474,7 +476,7 @@ koboldcpp_clblast:
 endif
 
 ifdef CUBLAS_BUILD
-koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
+koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
 	$(CUBLAS_BUILD)
 else
 koboldcpp_cublas:
@@ -482,7 +484,7 @@ koboldcpp_cublas:
 endif
 
 ifdef HIPBLAS_BUILD
-koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS)
+koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(HIP_OBJS) $(OBJS)
 	$(HIPBLAS_BUILD)
 else
 koboldcpp_hipblas:
@@ -490,15 +492,15 @@ koboldcpp_hipblas:
 endif
 
 # tools
-quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o
+quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 
Package.swift CHANGED
@@ -1,10 +1,10 @@
-// swift-tools-version:5.3
+// swift-tools-version:5.5
 
 import PackageDescription
 
 #if arch(arm) || arch(arm64)
 let platforms: [SupportedPlatform]? = [
-    .macOS(.v11),
+    .macOS(.v12),
     .iOS(.v14),
     .watchOS(.v4),
     .tvOS(.v14)
@@ -41,12 +41,13 @@ let package = Package(
             "ggml.c",
             "llama.cpp",
             "ggml-alloc.c",
+            "ggml-backend.c",
             "k_quants.c",
         ] + additionalSources,
         resources: resources,
         publicHeadersPath: "spm-headers",
         cSettings: [
-            .unsafeFlags(["-Wno-shorten-64-to-32"]),
+            .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
             .define("GGML_USE_K_QUANTS"),
             .define("GGML_USE_ACCELERATE")
             // NOTE: NEW_LAPACK will required iOS version 16.4+
colab.ipynb CHANGED
@@ -33,7 +33,7 @@
         "!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &\r\n",
         "!sleep 10\r\n",
         "!cat nohup.out\r\n",
-        "!python koboldcpp.py model.ggml --usecublas 0 mmq --gpulayers $Layers --hordeconfig concedo\r\n"
+        "!python koboldcpp.py model.ggml --usecublas 0 mmq --gpulayers $Layers\r\n"
       ]
     }
   ],
common/CMakeLists.txt CHANGED
@@ -5,6 +5,8 @@ set(TARGET common)
 add_library(${TARGET} OBJECT
     common.h
    common.cpp
+    sampling.h
+    sampling.cpp
    console.h
    console.cpp
    grammar-parser.h
common/common.cpp CHANGED
@@ -107,6 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
107
  std::string arg;
108
  gpt_params default_params;
109
  const std::string arg_prefix = "--";
 
110
 
111
  for (int i = 1; i < argc; i++) {
112
  arg = argv[i];
@@ -184,7 +185,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
184
  invalid_param = true;
185
  break;
186
  }
187
- params.top_k = std::stoi(argv[i]);
188
  } else if (arg == "-c" || arg == "--ctx-size") {
189
  if (++i >= argc) {
190
  invalid_param = true;
@@ -216,73 +217,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
216
  invalid_param = true;
217
  break;
218
  }
219
- params.top_p = std::stof(argv[i]);
220
  } else if (arg == "--temp") {
221
  if (++i >= argc) {
222
  invalid_param = true;
223
  break;
224
  }
225
- params.temp = std::stof(argv[i]);
226
  } else if (arg == "--tfs") {
227
  if (++i >= argc) {
228
  invalid_param = true;
229
  break;
230
  }
231
- params.tfs_z = std::stof(argv[i]);
232
  } else if (arg == "--typical") {
233
  if (++i >= argc) {
234
  invalid_param = true;
235
  break;
236
  }
237
- params.typical_p = std::stof(argv[i]);
238
  } else if (arg == "--repeat-last-n") {
239
  if (++i >= argc) {
240
  invalid_param = true;
241
  break;
242
  }
243
- params.repeat_last_n = std::stoi(argv[i]);
244
  } else if (arg == "--repeat-penalty") {
245
  if (++i >= argc) {
246
  invalid_param = true;
247
  break;
248
  }
249
- params.repeat_penalty = std::stof(argv[i]);
250
  } else if (arg == "--frequency-penalty") {
251
  if (++i >= argc) {
252
  invalid_param = true;
253
  break;
254
  }
255
- params.frequency_penalty = std::stof(argv[i]);
256
  } else if (arg == "--presence-penalty") {
257
  if (++i >= argc) {
258
  invalid_param = true;
259
  break;
260
  }
261
- params.presence_penalty = std::stof(argv[i]);
262
  } else if (arg == "--mirostat") {
263
  if (++i >= argc) {
264
  invalid_param = true;
265
  break;
266
  }
267
- params.mirostat = std::stoi(argv[i]);
268
  } else if (arg == "--mirostat-lr") {
269
  if (++i >= argc) {
270
  invalid_param = true;
271
  break;
272
  }
273
- params.mirostat_eta = std::stof(argv[i]);
274
  } else if (arg == "--mirostat-ent") {
275
  if (++i >= argc) {
276
  invalid_param = true;
277
  break;
278
  }
279
- params.mirostat_tau = std::stof(argv[i]);
280
  } else if (arg == "--cfg-negative-prompt") {
281
  if (++i >= argc) {
282
  invalid_param = true;
283
  break;
284
  }
285
- params.cfg_negative_prompt = argv[i];
286
  } else if (arg == "--cfg-negative-prompt-file") {
287
  if (++i >= argc) {
288
  invalid_param = true;
@@ -294,16 +295,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
294
  invalid_param = true;
295
  break;
296
  }
297
- std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
298
- if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
299
- params.cfg_negative_prompt.pop_back();
300
  }
301
  } else if (arg == "--cfg-scale") {
302
  if (++i >= argc) {
303
  invalid_param = true;
304
  break;
305
  }
306
- params.cfg_scale = std::stof(argv[i]);
307
  } else if (arg == "-b" || arg == "--batch-size") {
308
  if (++i >= argc) {
309
  invalid_param = true;
@@ -512,7 +513,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
512
  } else if (arg == "--ignore-eos") {
513
  params.ignore_eos = true;
514
  } else if (arg == "--no-penalize-nl") {
515
- params.penalize_nl = false;
516
  } else if (arg == "-l" || arg == "--logit-bias") {
517
  if (++i >= argc) {
518
  invalid_param = true;
@@ -524,7 +525,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
524
  std::string value_str;
525
  try {
526
  if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
527
- params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
528
  } else {
529
  throw std::exception();
530
  }
@@ -627,6 +628,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
627
  }
628
 
629
  void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 
 
630
  printf("usage: %s [options]\n", argv[0]);
631
  printf("\n");
632
  printf("options:\n");
@@ -659,19 +662,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
659
  printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
660
  printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
661
  printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
662
- printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
663
- printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
664
- printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
665
- printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
666
- printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
667
- printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
668
- printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
669
- printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
670
  printf(" --mirostat N use Mirostat sampling.\n");
671
  printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
672
- printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
673
- printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
674
- printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
675
  printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
676
  printf(" modifies the likelihood of token appearing in the completion,\n");
677
  printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
@@ -682,7 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
682
  printf(" negative prompt to use for guidance. (default: empty)\n");
683
  printf(" --cfg-negative-prompt-file FNAME\n");
684
  printf(" negative prompt file to use for guidance. (default: empty)\n");
685
- printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
686
  printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
687
  printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
688
  printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
@@ -690,7 +693,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
690
  printf(" --no-penalize-nl do not penalize newline token\n");
691
  printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
692
  printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
693
- printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
694
  printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
695
  printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
696
  printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
@@ -840,7 +843,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
840
  }
841
 
842
  if (params.ignore_eos) {
843
- params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
844
  }
845
 
846
  {
@@ -932,127 +935,6 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
932
  return result;
933
  }
934
 
935
- //
936
- // Sampling utils
937
- //
938
-
939
- llama_token llama_sample_token(
940
- struct llama_context * ctx,
941
- struct llama_context * ctx_guidance,
942
- struct llama_grammar * grammar,
943
- const struct gpt_params & params,
944
- const std::vector<llama_token> & last_tokens,
945
- std::vector<llama_token_data> & candidates,
946
- int idx) {
947
- const int n_ctx = llama_n_ctx(ctx);
948
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
949
-
950
- const float temp = params.temp;
951
- const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
952
- const float top_p = params.top_p;
953
- const float tfs_z = params.tfs_z;
954
- const float typical_p = params.typical_p;
955
- const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
956
- const float repeat_penalty = params.repeat_penalty;
957
- const float alpha_presence = params.presence_penalty;
958
- const float alpha_frequency = params.frequency_penalty;
959
- const int mirostat = params.mirostat;
960
- const float mirostat_tau = params.mirostat_tau;
961
- const float mirostat_eta = params.mirostat_eta;
962
- const bool penalize_nl = params.penalize_nl;
963
-
964
- llama_token id = 0;
965
-
966
- float * logits = llama_get_logits_ith(ctx, idx);
967
-
968
- // Apply params.logit_bias map
969
- for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
970
- logits[it->first] += it->second;
971
- }
972
-
973
- candidates.clear();
974
- for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
975
- candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
976
- }
977
-
978
- llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
979
-
980
- if (ctx_guidance) {
981
- llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
982
- }
983
-
984
- // apply penalties
985
- if (!last_tokens.empty()) {
986
- const float nl_logit = logits[llama_token_nl(ctx)];
987
- const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
988
-
989
- llama_sample_repetition_penalty(ctx, &cur_p,
990
- last_tokens.data() + last_tokens.size() - last_n_repeat,
991
- last_n_repeat, repeat_penalty);
992
- llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
993
- last_tokens.data() + last_tokens.size() - last_n_repeat,
994
- last_n_repeat, alpha_frequency, alpha_presence);
995
-
996
- if (!penalize_nl) {
997
- for (size_t idx = 0; idx < cur_p.size; idx++) {
998
- if (cur_p.data[idx].id == llama_token_nl(ctx)) {
999
- cur_p.data[idx].logit = nl_logit;
1000
- break;
1001
- }
1002
- }
1003
- }
1004
- }
1005
-
1006
- if (grammar != NULL) {
1007
- llama_sample_grammar(ctx, &cur_p, grammar);
1008
- }
1009
-
1010
- if (temp <= 0) {
1011
- // Greedy sampling
1012
- id = llama_sample_token_greedy(ctx, &cur_p);
1013
- } else {
1014
- if (mirostat == 1) {
1015
- static float mirostat_mu = 2.0f * mirostat_tau;
1016
- const int mirostat_m = 100;
1017
- llama_sample_temp(ctx, &cur_p, temp);
1018
- id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
1019
- } else if (mirostat == 2) {
1020
- static float mirostat_mu = 2.0f * mirostat_tau;
1021
- llama_sample_temp(ctx, &cur_p, temp);
1022
- id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
1023
- } else {
1024
- // Temperature sampling
1025
- size_t min_keep = std::max(1, params.n_probs);
1026
- llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
1027
- llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
1028
- llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
1029
- llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
1030
- llama_sample_temp(ctx, &cur_p, temp);
1031
-
1032
- {
1033
- const int n_top = 10;
1034
- LOG("top %d candidates:\n", n_top);
1035
-
1036
- for (int i = 0; i < n_top; i++) {
1037
- const llama_token id = cur_p.data[i].id;
1038
- LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
1039
- }
1040
- }
1041
-
1042
- id = llama_sample_token(ctx, &cur_p);
1043
-
1044
- LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
1045
- }
1046
- }
1047
- // printf("`%d`", candidates_p.size);
1048
-
1049
- if (grammar != NULL) {
1050
- llama_grammar_accept_token(ctx, grammar, id);
1051
- }
1052
-
1053
- return id;
1054
- }
1055
-
1056
  //
1057
  // YAML utils
1058
  //
@@ -1204,6 +1086,8 @@ std::string get_sortable_timestamp() {
1204
 
1205
  void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
1206
  const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
 
 
1207
  fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
1208
  fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
1209
  fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
@@ -1250,21 +1134,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
1250
 
1251
  fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
1252
  fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
1253
- dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
1254
- fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
1255
  fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
1256
  fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
1257
  fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
1258
  fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
1259
  fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
1260
- fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
1261
  dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
1262
  fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
1263
  fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
1264
  fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
1265
 
1266
- const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
1267
- const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
1268
  fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
1269
 
1270
  dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
@@ -1277,7 +1161,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
1277
  fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
1278
 
1279
  fprintf(stream, "logit_bias:\n");
1280
- for (std::pair<llama_token, float> lb : params.logit_bias) {
1281
  if (ignore_eos && lb.first == logit_bias_eos->first) {
1282
  continue;
1283
  }
@@ -1301,30 +1185,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
1301
  fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
1302
  fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
1303
  fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
1304
- fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
1305
- fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
1306
- fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
1307
  fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
1308
  fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
1309
  fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
1310
  fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
1311
  fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
1312
  fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
1313
- fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
1314
  fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
1315
  fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
1316
- fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
1317
  fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
1318
  fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
1319
  fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
1320
- fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
1321
  dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
1322
  fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
1323
  fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
1324
  fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
1325
  dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
1326
  fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
1327
- fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
1328
 
1329
  fprintf(stream, "reverse_prompt:\n");
1330
  for (std::string ap : params.antiprompt) {
@@ -1342,15 +1226,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
1342
  fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
1343
  fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
1344
  fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
1345
- fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
1346
 
1347
  const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
1348
  dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
1349
 
1350
- fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
1351
  fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
1352
- fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
1353
- fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
1354
- fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
1355
  fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
1356
  }
 
107
  std::string arg;
108
  gpt_params default_params;
109
  const std::string arg_prefix = "--";
110
+ llama_sampling_params & sparams = params.sampling_params;
111
 
112
  for (int i = 1; i < argc; i++) {
113
  arg = argv[i];
 
185
  invalid_param = true;
186
  break;
187
  }
188
+ sparams.top_k = std::stoi(argv[i]);
189
  } else if (arg == "-c" || arg == "--ctx-size") {
190
  if (++i >= argc) {
191
  invalid_param = true;
 
217
  invalid_param = true;
218
  break;
219
  }
220
+ sparams.top_p = std::stof(argv[i]);
221
  } else if (arg == "--temp") {
222
  if (++i >= argc) {
223
  invalid_param = true;
224
  break;
225
  }
226
+ sparams.temp = std::stof(argv[i]);
227
  } else if (arg == "--tfs") {
228
  if (++i >= argc) {
229
  invalid_param = true;
230
  break;
231
  }
232
+ sparams.tfs_z = std::stof(argv[i]);
233
  } else if (arg == "--typical") {
234
  if (++i >= argc) {
235
  invalid_param = true;
236
  break;
237
  }
238
+ sparams.typical_p = std::stof(argv[i]);
239
  } else if (arg == "--repeat-last-n") {
240
  if (++i >= argc) {
241
  invalid_param = true;
242
  break;
243
  }
244
+ sparams.repeat_last_n = std::stoi(argv[i]);
245
  } else if (arg == "--repeat-penalty") {
246
  if (++i >= argc) {
247
  invalid_param = true;
248
  break;
249
  }
250
+ sparams.repeat_penalty = std::stof(argv[i]);
251
  } else if (arg == "--frequency-penalty") {
252
  if (++i >= argc) {
253
  invalid_param = true;
254
  break;
255
  }
256
+ sparams.frequency_penalty = std::stof(argv[i]);
257
  } else if (arg == "--presence-penalty") {
258
  if (++i >= argc) {
259
  invalid_param = true;
260
  break;
261
  }
262
+ sparams.presence_penalty = std::stof(argv[i]);
263
  } else if (arg == "--mirostat") {
264
  if (++i >= argc) {
265
  invalid_param = true;
266
  break;
267
  }
268
+ sparams.mirostat = std::stoi(argv[i]);
269
  } else if (arg == "--mirostat-lr") {
270
  if (++i >= argc) {
271
  invalid_param = true;
272
  break;
273
  }
274
+ sparams.mirostat_eta = std::stof(argv[i]);
275
  } else if (arg == "--mirostat-ent") {
276
  if (++i >= argc) {
277
  invalid_param = true;
278
  break;
279
  }
280
+ sparams.mirostat_tau = std::stof(argv[i]);
281
  } else if (arg == "--cfg-negative-prompt") {
282
  if (++i >= argc) {
283
  invalid_param = true;
284
  break;
285
  }
286
+ sparams.cfg_negative_prompt = argv[i];
287
  } else if (arg == "--cfg-negative-prompt-file") {
288
  if (++i >= argc) {
289
  invalid_param = true;
 
295
  invalid_param = true;
296
  break;
297
  }
298
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
299
+ if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
300
+ sparams.cfg_negative_prompt.pop_back();
301
  }
302
  } else if (arg == "--cfg-scale") {
303
  if (++i >= argc) {
304
  invalid_param = true;
305
  break;
306
  }
307
+ sparams.cfg_scale = std::stof(argv[i]);
308
  } else if (arg == "-b" || arg == "--batch-size") {
309
  if (++i >= argc) {
310
  invalid_param = true;
 
513
  } else if (arg == "--ignore-eos") {
514
  params.ignore_eos = true;
515
  } else if (arg == "--no-penalize-nl") {
516
+ sparams.penalize_nl = false;
517
  } else if (arg == "-l" || arg == "--logit-bias") {
518
  if (++i >= argc) {
519
  invalid_param = true;
 
525
  std::string value_str;
526
  try {
527
  if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
528
+ sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
529
  } else {
530
  throw std::exception();
531
  }
 
628
  }
629
 
630
  void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
631
+ const llama_sampling_params & sparams = params.sampling_params;
632
+
633
  printf("usage: %s [options]\n", argv[0]);
634
  printf("\n");
635
  printf("options:\n");
 
662
  printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
663
  printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
664
  printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
665
+ printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
666
+ printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
667
+ printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
668
+ printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
669
+ printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
670
+ printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
671
+ printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
672
+ printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
673
  printf(" --mirostat N use Mirostat sampling.\n");
674
  printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
675
+ printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
676
+ printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
677
+ printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
678
  printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
679
  printf(" modifies the likelihood of token appearing in the completion,\n");
680
  printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
 
685
  printf(" negative prompt to use for guidance. (default: empty)\n");
686
  printf(" --cfg-negative-prompt-file FNAME\n");
687
  printf(" negative prompt file to use for guidance. (default: empty)\n");
688
+ printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
689
  printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
690
  printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
691
  printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
 
693
  printf(" --no-penalize-nl do not penalize newline token\n");
694
  printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
695
  printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
696
+ printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
697
  printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
698
  printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
699
  printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
 
843
  }
844
 
845
  if (params.ignore_eos) {
846
+ params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
847
  }
848
 
849
  {
 
935
  return result;
936
  }
937
 
938
  //
939
  // YAML utils
940
  //
 
1086
 
1087
  void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
1088
  const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
1089
+ const llama_sampling_params & sparams = params.sampling_params;
1090
+
1091
  fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
1092
  fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
1093
  fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
 
1134
 
1135
  fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
1136
  fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
1137
+ dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
1138
+ fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
1139
  fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
1140
  fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
1141
  fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
1142
  fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
1143
  fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
1144
+ fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
1145
  dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
1146
  fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
1147
  fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
1148
  fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
1149
 
1150
+ const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
1151
+ const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
1152
  fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
1153
 
1154
  dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
 
1161
  fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
1162
 
1163
  fprintf(stream, "logit_bias:\n");
1164
+ for (std::pair<llama_token, float> lb : sparams.logit_bias) {
1165
  if (ignore_eos && lb.first == logit_bias_eos->first) {
1166
  continue;
1167
  }
 
1185
  fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
1186
  fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
1187
  fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
1188
+ fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
1189
+ fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
1190
+ fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
1191
  fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
1192
  fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
1193
  fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
1194
  fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
1195
  fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
1196
  fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
1197
+ fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
1198
  fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
1199
  fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
1200
+ fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
1201
  fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
1202
  fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
1203
  fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
1204
+ fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
1205
  dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
1206
  fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
1207
  fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
1208
  fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
1209
  dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
1210
  fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
1211
+ fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
1212
 
1213
  fprintf(stream, "reverse_prompt:\n");
1214
  for (std::string ap : params.antiprompt) {
 
1226
  fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
1227
  fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
1228
  fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
1229
+ fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
1230
 
1231
  const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
1232
  dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
1233
 
1234
+ fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
1235
  fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
1236
+ fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
1237
+ fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
1238
+ fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
1239
  fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
1240
  }
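
The common.cpp hunks above delete the old free-standing llama_sample_token() helper and route every sampler option through params.sampling_params; for example, --top-k now writes params.sampling_params.top_k instead of params.top_k. A minimal C++ usage sketch, not part of the commit, written only against the signatures visible in this diff; the variables ctx, ctx_guidance, grammar, params, last_tokens and candidates are assumed to already exist in the caller:

    // Before this commit (helper removed from common/common.cpp):
    //   llama_token id = llama_sample_token(ctx, ctx_guidance, grammar,
    //                                       params, last_tokens, candidates);
    //
    // After this commit (see common/sampling.cpp below): the grammar and the
    // mirostat state live inside a llama_sampling_context keyed by sequence id.
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
    llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling,
                                           last_tokens, candidates, /*idx=*/0, /*seq=*/0);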
common/common.h CHANGED
@@ -4,6 +4,8 @@
 
 #include "llama.h"
 
+#include "sampling.h"
+
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
@@ -49,7 +51,6 @@ struct gpt_params {
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
@@ -67,13 +68,8 @@ struct gpt_params {
     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
-
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-
-    // Classifier-Free Guidance
-    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt; // string to help guidance
-    float cfg_scale = 1.f; // How strong is guidance
+    // // sampling parameters
+    struct llama_sampling_params sampling_params;
 
     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_draft = ""; // draft model for speculative decoding
@@ -115,7 +111,6 @@ struct gpt_params {
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
     bool instruct = false; // instruction mode (used for Alpaca models)
-    bool penalize_nl = true; // consider newlines as a repeatable token
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
@@ -180,36 +175,6 @@ std::string llama_detokenize_bpe(
     llama_context * ctx,
     const std::vector<llama_token> & tokens);
 
-//
-// Sampling utils
-//
-
-// this is a common sampling function used across the examples for convenience
-// it can serve as a starting point for implementing your own sampling function
-//
-// required:
-//  - ctx: context to use for sampling
-//  - params: sampling parameters
-//
-// optional:
-//  - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
-//  - grammar: grammar to use for sampling, ignore if NULL
-//  - last_tokens: needed for repetition penalty, ignore if empty
-//  - idx: sample from llama_get_logits_ith(ctx, idx)
-//
-// returns:
-//  - token: sampled token
-//  - candidates: vector of candidate tokens
-//
-llama_token llama_sample_token(
-    struct llama_context * ctx,
-    struct llama_context * ctx_guidance,
-    struct llama_grammar * grammar,
-    const struct gpt_params & params,
-    const std::vector<llama_token> & last_tokens,
-    std::vector<llama_token_data> & candidates,
-    int idx = 0);
-
 //
 // YAML utils
 //
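
As the common.h hunks show, n_probs, penalize_nl, logit_bias and the classifier-free-guidance settings move out of gpt_params into the nested llama_sampling_params. A short migration sketch, not from the commit; the field names are taken from the diff, and lctx stands in for an existing llama_context pointer:

    gpt_params params;

    // old field (removed)              new location
    // params.penalize_nl = false;  ->  params.sampling_params.penalize_nl = false;
    // params.n_probs     = 5;      ->  params.sampling_params.n_probs     = 5;
    // params.cfg_scale   = 1.5f;   ->  params.sampling_params.cfg_scale   = 1.5f;

    // --ignore-eos keeps its gpt_params flag, but the bias it installs now
    // targets the nested map, exactly as llama_init_from_gpt_params does above:
    params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;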
common/sampling.cpp ADDED
@@ -0,0 +1,166 @@
1
+ #include "sampling.h"
2
+
3
+ llama_sampling_context::~llama_sampling_context() {
4
+ for (auto & it : sequence_contexts) {
5
+ if (it.second.grammar != NULL) {
6
+ llama_grammar_free(it.second.grammar);
7
+ it.second.grammar = NULL;
8
+ }
9
+ }
10
+ }
11
+
12
+ llama_sampling_context llama_sampling_context_init(
13
+ const struct gpt_params & params,
14
+ llama_grammar * grammar) {
15
+ llama_sampling_context result;
16
+
17
+ result.params = params.sampling_params;
18
+ result.grammar = grammar;
19
+ return result;
20
+ }
21
+
22
+ // Note: Creates the context if it doesn't exist, so this always return something.
23
+ llama_sampler_sequence_context & llama_sampling_get_sequence_context(
24
+ llama_sampling_context & ctx_sampling,
25
+ const llama_seq_id seq) {
26
+ const auto it = ctx_sampling.sequence_contexts.find(seq);
27
+ if (it != ctx_sampling.sequence_contexts.end()) {
28
+ return it->second;
29
+ }
30
+ llama_sampler_sequence_context new_ctx = {
31
+ 2.0f * ctx_sampling.params.mirostat_tau,
32
+ ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL,
33
+ };
34
+ return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second;
35
+ }
36
+
37
+ bool llama_sampling_context_reset(
38
+ llama_sampling_context & ctx_sampling,
39
+ const llama_seq_id seq) {
40
+ const auto it = ctx_sampling.sequence_contexts.find(seq);
41
+ if (it == ctx_sampling.sequence_contexts.end()) return false;
42
+ if (it->second.grammar != NULL) {
43
+ llama_grammar_free(it->second.grammar);
44
+ it->second.grammar = NULL;
45
+ }
46
+ ctx_sampling.sequence_contexts.erase(it);
47
+ return true;
48
+ }
49
+
50
+ llama_token llama_sampling_sample(
51
+ struct llama_context * ctx,
52
+ struct llama_context * ctx_guidance,
53
+ struct llama_sampling_context & ctx_sampling,
54
+ const std::vector<llama_token> & last_tokens,
55
+ std::vector<llama_token_data> & candidates,
56
+ const int idx,
57
+ llama_seq_id seq) {
58
+ const int n_ctx = llama_n_ctx(ctx);
59
+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
60
+
61
+ const llama_sampling_params & params = ctx_sampling.params;
62
+ const float temp = params.temp;
63
+ const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
64
+ const float top_p = params.top_p;
65
+ const float tfs_z = params.tfs_z;
66
+ const float typical_p = params.typical_p;
67
+ const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
68
+ const float repeat_penalty = params.repeat_penalty;
69
+ const float alpha_presence = params.presence_penalty;
70
+ const float alpha_frequency = params.frequency_penalty;
71
+ const int mirostat = params.mirostat;
72
+ const float mirostat_tau = params.mirostat_tau;
73
+ const float mirostat_eta = params.mirostat_eta;
74
+ const bool penalize_nl = params.penalize_nl;
75
+
76
+ llama_token id = 0;
77
+
78
+ float * logits = llama_get_logits_ith(ctx, idx);
79
+
80
+ // Apply params.logit_bias map
81
+ for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
82
+ logits[it->first] += it->second;
83
+ }
84
+
85
+ candidates.clear();
86
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
87
+ candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
88
+ }
89
+
90
+ llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
91
+
92
+ if (ctx_guidance) {
93
+ llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
94
+ }
95
+
96
+ // apply penalties
97
+ if (!last_tokens.empty()) {
98
+ const float nl_logit = logits[llama_token_nl(ctx)];
99
+ const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
100
+
101
+ llama_sample_repetition_penalty(ctx, &cur_p,
102
+ last_tokens.data() + last_tokens.size() - last_n_repeat,
103
+ last_n_repeat, repeat_penalty);
104
+ llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
105
+ last_tokens.data() + last_tokens.size() - last_n_repeat,
106
+ last_n_repeat, alpha_frequency, alpha_presence);
107
+
108
+ if (!penalize_nl) {
109
+ for (size_t idx = 0; idx < cur_p.size; idx++) {
110
+ if (cur_p.data[idx].id == llama_token_nl(ctx)) {
111
+ cur_p.data[idx].logit = nl_logit;
112
+ break;
113
+ }
114
+ }
115
+ }
116
+ }
117
+
118
+ llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq);
119
+
120
+ if (ctx_seq.grammar != NULL) {
121
+ llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
122
+ }
123
+
124
+ if (temp <= 0) {
125
+ // Greedy sampling
126
+ id = llama_sample_token_greedy(ctx, &cur_p);
127
+ } else {
128
+ if (mirostat == 1) {
129
+ const int mirostat_m = 100;
130
+ llama_sample_temp(ctx, &cur_p, temp);
131
+ id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu);
132
+ } else if (mirostat == 2) {
133
+ llama_sample_temp(ctx, &cur_p, temp);
134
+ id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu);
135
+ } else {
136
+ // Temperature sampling
137
+ size_t min_keep = std::max(1, params.n_probs);
138
+ llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
139
+ llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
140
+ llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
141
+ llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
142
+ llama_sample_temp(ctx, &cur_p, temp);
143
+
144
+ {
145
+ const int n_top = 10;
146
+ LOG("top %d candidates:\n", n_top);
147
+
148
+ for (int i = 0; i < n_top; i++) {
149
+ const llama_token id = cur_p.data[i].id;
150
+ (void)id; // To avoid a warning that id is unused when logging is disabled.
151
+ LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
152
+ }
153
+ }
154
+
155
+ id = llama_sample_token(ctx, &cur_p);
156
+
157
+ LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
158
+ }
159
+ }
160
+
161
+ if (ctx_seq.grammar != NULL) {
162
+ llama_grammar_accept_token(ctx, ctx_seq.grammar, id);
163
+ }
164
+
165
+ return id;
166
+ }
common/sampling.h ADDED
@@ -0,0 +1,108 @@
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+
5
+ #include <string>
6
+ #include <vector>
7
+ #include <unordered_map>
8
+
9
+ // sampling parameters
10
+ typedef struct llama_sampling_params {
11
+ int32_t top_k = 40; // <= 0 to use vocab size
12
+ float top_p = 0.95f; // 1.0 = disabled
13
+ float tfs_z = 1.00f; // 1.0 = disabled
14
+ float typical_p = 1.00f; // 1.0 = disabled
15
+ float temp = 0.80f; // 1.0 = disabled
16
+ float repeat_penalty = 1.10f; // 1.0 = disabled
17
+ int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
18
+ float frequency_penalty = 0.00f; // 0.0 = disabled
19
+ float presence_penalty = 0.00f; // 0.0 = disabled
20
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
21
+ float mirostat_tau = 5.00f; // target entropy
22
+ float mirostat_eta = 0.10f; // learning rate
23
+
24
+ bool penalize_nl = true; // consider newlines as a repeatable token
25
+
26
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
27
+
28
+ // Classifier-Free Guidance
29
+ // https://arxiv.org/abs/2306.17806
30
+ std::string cfg_negative_prompt; // string to help guidance
31
+ float cfg_scale = 1.f; // how strong the guidance is
32
+
33
+ std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
34
+
35
+ } llama_sampling_params;
36
+
37
+ // per-sequence sampler context
38
+ typedef struct llama_sampler_sequence_context {
39
+ float mirostat_mu; // mirostat sampler state
40
+ llama_grammar * grammar;
41
+ } llama_sampler_sequence_context;
42
+
43
+ // general sampler context
44
+ typedef struct llama_sampling_context {
45
+ ~llama_sampling_context();
46
+
47
+ // parameters that will be used for sampling and when creating
48
+ // new llama_sampler_sequence_context instances
49
+ llama_sampling_params params;
50
+
51
+ // map of sequence ids to sampler contexts
52
+ std::unordered_map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts;
53
+
54
+ // when non-NULL, new instances of llama_sampler_sequence_context
55
+ // will get a copy of the grammar here
56
+ // note: only the pointer is stored here, it is not a copy of
57
+ // the grammar and shouldn't be freed
58
+ llama_grammar * grammar;
59
+ } llama_sampling_context;
60
+
61
+ #include "common.h"
62
+
63
+ // Create a new sampling context instance.
64
+ llama_sampling_context llama_sampling_context_init(
65
+ const struct gpt_params & params,
66
+ llama_grammar * grammar = NULL);
67
+
68
+ // Fetches the sampler context for the specified sequence id (defaults to 0).
69
+ // If the context for that sequence id doesn't already exist, it will be created with
70
+ // default values based on the parameters in the ctx_sampling argument.
71
+ llama_sampler_sequence_context & llama_sampling_get_sequence_context(
72
+ llama_sampling_context & ctx_sampling,
73
+ const llama_seq_id seq = 0);
74
+
75
+ // Reset the sampler context for the supplied sequence id (defaults to 0).
76
+ // This is necessary to reuse a sequence id or free memory used by sequences
77
+ // that are no longer required.
78
+ bool llama_sampling_context_reset(
79
+ llama_sampling_context & ctx_sampling,
80
+ const llama_seq_id seq = 0);
81
+
82
+ // this is a common sampling function used across the examples for convenience
83
+ // it can serve as a starting point for implementing your own sampling function
84
+ // Note: When using multiple sequences, it is the caller's responsibility to call
85
+ // llama_sampling_context_reset when a sequence ends
86
+ //
87
+ // required:
88
+ // - ctx: context to use for sampling
89
+ // - ctx_sampling: sampling-specific context
90
+ //
91
+ // optional:
92
+ // - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
93
+ // - last_tokens: needed for repetition penalty, ignore if empty
94
+ // - idx: sample from llama_get_logits_ith(ctx, idx)
95
+ // - seq: sequence id to associate sampler state with
96
+ //
97
+ // returns:
98
+ // - token: sampled token
99
+ // - candidates: vector of candidate tokens
100
+ //
101
+ llama_token llama_sampling_sample(
102
+ struct llama_context * ctx,
103
+ struct llama_context * ctx_guidance,
104
+ struct llama_sampling_context & ctx_sampling,
105
+ const std::vector<llama_token> & last_tokens,
106
+ std::vector<llama_token_data> & candidates,
107
+ const int idx = 0,
108
+ llama_seq_id seq = 0);
convert-bloom-hf-to-gguf.py ADDED
@@ -0,0 +1,238 @@
1
+ #!/usr/bin/env python3
2
+ # HF bloom --> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import re
10
+ import struct
11
+ import sys
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import numpy as np
16
+ import torch
17
+ from transformers import AutoTokenizer # type: ignore[import]
18
+
19
+ if 'NO_LOCAL_GGUF' not in os.environ:
20
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
21
+ import gguf
22
+
23
+
24
+ def count_model_parts(dir_model: Path) -> int:
25
+ num_parts = 0
26
+ for filename in os.listdir(dir_model):
27
+ if filename.startswith("pytorch_model-"):
28
+ num_parts += 1
29
+
30
+ if num_parts > 0:
31
+ print("gguf: found " + str(num_parts) + " model parts")
32
+ return num_parts
33
+
34
+
35
+ # Supported Models:
36
+ # https://huggingface.co/bigscience/bloom-1b7
37
+ # https://huggingface.co/bigscience/bloom-3b
38
+ # https://huggingface.co/bigscience/bloom-7b1
39
+ # https://huggingface.co/Langboat/bloom-1b4-zh
40
+ def parse_args() -> argparse.Namespace:
41
+ parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
42
+ parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
43
+ parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
44
+ parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
45
+ parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
46
+ return parser.parse_args()
47
+
48
+ args = parse_args()
49
+
50
+ dir_model = args.model
51
+ ftype = args.ftype
52
+ if not dir_model.is_dir():
53
+ print(f'Error: {args.model} is not a directory', file = sys.stderr)
54
+ sys.exit(1)
55
+
56
+ # possible tensor data types
57
+ # ftype == 0 -> float32
58
+ # ftype == 1 -> float16
59
+
60
+ # map from ftype to string
61
+ ftype_str = ["f32", "f16"]
62
+
63
+ if args.outfile is not None:
64
+ fname_out = args.outfile
65
+ else:
66
+ # output in the same directory as the model by default
67
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
68
+
69
+ print("gguf: loading model "+dir_model.name)
70
+
71
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
72
+ hparams = json.load(f)
73
+
74
+ if hparams["architectures"][0] != "BloomForCausalLM":
75
+ print("Model architecture not supported: " + hparams["architectures"][0])
76
+ sys.exit(1)
77
+
78
+ # get number of model parts
79
+ num_parts = count_model_parts(dir_model)
80
+
81
+ ARCH=gguf.MODEL_ARCH.BLOOM
82
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
83
+
84
+ print("gguf: get model metadata")
85
+
86
+ block_count = hparams["n_layer"]
87
+
88
+ gguf_writer.add_name("Bloom")
89
+ n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
90
+ n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
91
+ gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
92
+ gguf_writer.add_embedding_length(n_embed)
93
+ gguf_writer.add_feed_forward_length(4 * n_embed)
94
+ gguf_writer.add_block_count(block_count)
95
+ gguf_writer.add_head_count(n_head)
96
+ gguf_writer.add_head_count_kv(n_head)
97
+ gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
98
+ gguf_writer.add_file_type(ftype)
99
+
100
+ # TOKENIZATION
101
+
102
+ print("gguf: get tokenizer metadata")
103
+
104
+ tokens: list[bytearray] = []
105
+ scores: list[float] = []
106
+ toktypes: list[int] = []
107
+
108
+ # gpt2 tokenizer
109
+ gguf_writer.add_tokenizer_model("gpt2")
110
+
111
+ print("gguf: get gpt2 tokenizer vocab")
112
+
113
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
114
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
115
+
116
+ # The number of tokens in tokenizer.json can differ from the expected vocab size.
117
+ # This causes downstream issues with mismatched tensor sizes when running the inference
118
+ vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
119
+ assert max(tokenizer.vocab.values()) < vocab_size
120
+
121
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
122
+
123
+ for i in range(vocab_size):
124
+ tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
125
+ scores.append(0.0) # dummy
126
+ toktypes.append(gguf.TokenType.NORMAL)
127
+
128
+ gguf_writer.add_token_list(tokens)
129
+ gguf_writer.add_token_scores(scores)
130
+ gguf_writer.add_token_types(toktypes)
131
+
132
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
133
+ special_vocab.add_to_gguf(gguf_writer)
134
+
135
+ # TENSORS
136
+
137
+ tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
138
+
139
+ # params for qkv transform
140
+ n_head_kv = hparams.get("n_head_kv", n_head)
141
+ head_dim = n_embed // n_head
142
+
143
+ # tensor info
144
+ print("gguf: get tensor metadata")
145
+
146
+ if num_parts == 0:
147
+ part_names = iter(("pytorch_model.bin",))
148
+ else:
149
+ part_names = (
150
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
151
+ )
152
+
153
+ for part_name in part_names:
154
+ if args.vocab_only:
155
+ break
156
+ print("gguf: loading model part '" + part_name + "'")
157
+ model_part = torch.load(dir_model / part_name, map_location="cpu")
158
+
159
+ has_lm_head = True
160
+ if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
161
+ has_lm_head = False
162
+
163
+ for original_name in model_part.keys():
164
+ data = model_part[original_name]
165
+ name = re.sub(r'transformer\.', '', original_name)
166
+
167
+ old_dtype = data.dtype
168
+
169
+ # convert any unsupported data types to float32
170
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
171
+ data = data.to(torch.float32)
172
+
173
+ data = data.squeeze().numpy()
174
+
175
+ if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
176
+ # Map bloom-style qkv_linear to gpt-style qkv_linear
177
+ # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
178
+ # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
179
+ qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
180
+ data = np.concatenate(
181
+ (qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
182
+ qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
183
+ qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
184
+ axis=0
185
+ )
186
+ print("re-format attention.linear_qkv.weight")
187
+ elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
188
+ qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
189
+ data = np.concatenate(
190
+ (qkv_bias[:, 0, :].reshape((n_embed,)),
191
+ qkv_bias[:, 1, :].reshape((n_embed,)),
192
+ qkv_bias[:, 2, :].reshape((n_embed,))),
193
+ axis=0
194
+ )
195
+ print("re-format attention.linear_qkv.bias")
196
+
197
+ # map tensor names
198
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
199
+ if new_name is None:
200
+ print("Can not map tensor '" + name + "'")
201
+ sys.exit()
202
+
203
+ n_dims = len(data.shape)
204
+ data_dtype = data.dtype
205
+
206
+ # if f32 desired, convert any float16 to float32
207
+ if ftype == 0 and data_dtype == np.float16:
208
+ data = data.astype(np.float32)
209
+
210
+ # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
211
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
212
+ data = data.astype(np.float32)
213
+
214
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
215
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
216
+ data = data.astype(np.float16)
217
+
218
+ print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
219
+
220
+ gguf_writer.add_tensor(new_name, data)
221
+
222
+ if not has_lm_head and name == "word_embeddings.weight":
223
+ gguf_writer.add_tensor("output.weight", data)
224
+ print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa
225
+
226
+
227
+ print("gguf: write header")
228
+ gguf_writer.write_header_to_file()
229
+ print("gguf: write metadata")
230
+ gguf_writer.write_kv_data_to_file()
231
+ if not args.vocab_only:
232
+ print("gguf: write tensors")
233
+ gguf_writer.write_tensors_to_file()
234
+
235
+ gguf_writer.close()
236
+
237
+ print(f"gguf: model successfully exported to '{fname_out}'")
238
+ print("")
convert-mpt-hf-to-gguf.py ADDED
@@ -0,0 +1,216 @@
1
+ #!/usr/bin/env python3
2
+ # HF mpt --> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import struct
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import numpy as np
15
+ import torch
16
+ from transformers import AutoTokenizer # type: ignore[import]
17
+
18
+ if 'NO_LOCAL_GGUF' not in os.environ:
19
+ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
20
+ import gguf
21
+
22
+
23
+ def count_model_parts(dir_model: Path) -> int:
24
+ num_parts = 0
25
+ for filename in os.listdir(dir_model):
26
+ if filename.startswith("pytorch_model-"):
27
+ num_parts += 1
28
+
29
+ if num_parts > 0:
30
+ print("gguf: found " + str(num_parts) + " model parts")
31
+ return num_parts
32
+
33
+
34
+ def parse_args() -> argparse.Namespace:
35
+ parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
36
+ parser.add_argument(
37
+ "--vocab-only", action="store_true",
38
+ help="extract only the vocab",
39
+ )
40
+ parser.add_argument(
41
+ "--outfile", type=Path,
42
+ help="path to write to; default: based on input",
43
+ )
44
+ parser.add_argument(
45
+ "model", type=Path,
46
+ help="directory containing model file, or model file itself (*.bin)",
47
+ )
48
+ parser.add_argument(
49
+ "ftype", type=int, choices=[0, 1], default=1, nargs='?',
50
+ help="output format - use 0 for float32, 1 for float16",
51
+ )
52
+ return parser.parse_args()
53
+
54
+ args = parse_args()
55
+
56
+ dir_model = args.model
57
+ ftype = args.ftype
58
+ if not dir_model.is_dir():
59
+ print(f'Error: {args.model} is not a directory', file = sys.stderr)
60
+ sys.exit(1)
61
+
62
+ # possible tensor data types
63
+ # ftype == 0 -> float32
64
+ # ftype == 1 -> float16
65
+
66
+ # map from ftype to string
67
+ ftype_str = ["f32", "f16"]
68
+
69
+ if args.outfile is not None:
70
+ fname_out = args.outfile
71
+ else:
72
+ # output in the same directory as the model by default
73
+ fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
74
+
75
+ print("gguf: loading model "+dir_model.name)
76
+
77
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
78
+ hparams = json.load(f)
79
+
80
+ if hparams["architectures"][0] != "MPTForCausalLM":
81
+ print("Model architecture not supported: " + hparams["architectures"][0])
82
+
83
+ sys.exit()
84
+
85
+ # get number of model parts
86
+ num_parts = count_model_parts(dir_model)
87
+
88
+ ARCH=gguf.MODEL_ARCH.MPT
89
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
90
+
91
+ print("gguf: get model metadata")
92
+
93
+ block_count = hparams["n_layers"]
94
+
95
+ gguf_writer.add_name(dir_model.name)
96
+ gguf_writer.add_context_length(hparams["max_seq_len"])
97
+ gguf_writer.add_embedding_length(hparams["d_model"])
98
+ gguf_writer.add_block_count(block_count)
99
+ gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
100
+ gguf_writer.add_head_count(hparams["n_heads"])
101
+ gguf_writer.add_layer_norm_eps(1e-05)
102
+ if hparams["attn_config"]["clip_qkv"] is not None:
103
+ gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
104
+ gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])
105
+
106
+ # TOKENIZATION
107
+
108
+ print("gguf: get tokenizer metadata")
109
+
110
+ tokens: list[bytearray] = []
111
+ scores: list[float] = []
112
+ toktypes: list[int] = []
113
+
114
+ # gpt2 tokenizer
115
+ gguf_writer.add_tokenizer_model("gpt2")
116
+
117
+ print("gguf: get gpt2 tokenizer vocab")
118
+
119
+ # MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
120
+ # there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
121
+ # accomodate some "reserved" tokens; this is causing problems down the line in
122
+ # llama.cpp, so we pad the vocab with dummy tokens:
123
+
124
+ vocab_size = hparams["vocab_size"]
125
+
126
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
127
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
128
+
129
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
130
+
131
+ for i in range(vocab_size):
132
+ tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
133
+ scores.append(0.0) # dummy
134
+ toktypes.append(gguf.TokenType.NORMAL)
135
+
136
+ gguf_writer.add_token_list(tokens)
137
+ gguf_writer.add_token_scores(scores)
138
+ gguf_writer.add_token_types(toktypes)
139
+
140
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
141
+ special_vocab.add_to_gguf(gguf_writer)
142
+
143
+ # TENSORS
144
+
145
+ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
146
+
147
+ # tensor info
148
+ print("gguf: get tensor metadata")
149
+
150
+ if num_parts == 0:
151
+ part_names = iter(("pytorch_model.bin",))
152
+ else:
153
+ part_names = (
154
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
155
+ )
156
+
157
+ for part_name in part_names:
158
+ if args.vocab_only:
159
+ break
160
+ print("gguf: loading model part '" + part_name + "'")
161
+ model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
162
+
163
+ for name in model_part.keys():
164
+ data = model_part[name]
165
+
166
+ old_dtype = data.dtype
167
+
168
+ # convert any unsupported data types to float32
169
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
170
+ data = data.to(torch.float32)
171
+
172
+ data = data.squeeze().numpy()
173
+
174
+ # map tensor names
175
+ new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
176
+ if new_name is None:
177
+ print("Cannot map tensor '" + name + "'")
178
+ continue # for the sake of compatibility with some old published models, don't quit
179
+ sys.exit()
180
+
181
+ n_dims = len(data.shape)
182
+ data_dtype = data.dtype
183
+
184
+ # if f32 desired, convert any float16 to float32
185
+ if ftype == 0 and data_dtype == np.float16:
186
+ data = data.astype(np.float32)
187
+
188
+ # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
189
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
190
+ data = data.astype(np.float32)
191
+
192
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
193
+ if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
194
+ data = data.astype(np.float16)
195
+
196
+ print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
197
+
198
+ gguf_writer.add_tensor(new_name, data)
199
+
200
+ # note: MPT output is tied to (same as) wte in original model;
201
+ # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
202
+ if new_name == "token_embd.weight":
203
+ gguf_writer.add_tensor("output.weight", data)
204
+
205
+ print("gguf: write header")
206
+ gguf_writer.write_header_to_file()
207
+ print("gguf: write metadata")
208
+ gguf_writer.write_kv_data_to_file()
209
+ if not args.vocab_only:
210
+ print("gguf: write tensors")
211
+ gguf_writer.write_tensors_to_file()
212
+
213
+ gguf_writer.close()
214
+
215
+ print(f"gguf: model successfully exported to '{fname_out}'")
216
+ print("")
convert-refact-hf-to-gguf.py ADDED
@@ -0,0 +1,263 @@
1
+ #!/usr/bin/env python3
2
+ # HF refact --> gguf conversion
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ import torch
14
+ from transformers import AutoTokenizer # type: ignore[import]
15
+
16
+ if "NO_LOCAL_GGUF" not in os.environ:
17
+ sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
18
+ import gguf
19
+
20
+ def count_model_parts(dir_model: Path) -> int:
21
+ num_parts = 0
22
+ for filename in os.listdir(dir_model):
23
+ if filename.startswith("pytorch_model-"):
24
+ num_parts += 1
25
+
26
+ if num_parts > 0:
27
+ print("gguf: found " + str(num_parts) + " model parts")
28
+ return num_parts
29
+
30
+
31
+ def parse_args() -> argparse.Namespace:
32
+ parser = argparse.ArgumentParser(
33
+ description="Convert a Refact model to a GGML compatible file"
34
+ )
35
+ parser.add_argument(
36
+ "--vocab-only",
37
+ action="store_true",
38
+ help="extract only the vocab",
39
+ )
40
+ parser.add_argument(
41
+ "--outfile",
42
+ type=Path,
43
+ help="path to write to; default: based on input",
44
+ )
45
+ parser.add_argument(
46
+ "model",
47
+ type=Path,
48
+ help="directory containing model file, or model file itself (*.bin)",
49
+ )
50
+ parser.add_argument(
51
+ "ftype",
52
+ type=int,
53
+ choices=[0, 1],
54
+ default=1,
55
+ nargs="?",
56
+ help="output format - use 0 for float32, 1 for float16",
57
+ )
58
+ return parser.parse_args()
59
+
60
+
61
+ args = parse_args()
62
+
63
+ dir_model = args.model
64
+ ftype = args.ftype
65
+ if not dir_model.is_dir():
66
+ print(f"Error: {args.model} is not a directory", file=sys.stderr)
67
+ sys.exit(1)
68
+
69
+ # possible tensor data types
70
+ # ftype == 0 -> float32
71
+ # ftype == 1 -> float16
72
+
73
+ # map from ftype to string
74
+ ftype_str = ["f32", "f16"]
75
+
76
+ if args.outfile is not None:
77
+ fname_out = args.outfile
78
+ else:
79
+ # output in the same directory as the model by default
80
+ fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"
81
+
82
+ print("gguf: loading model " + dir_model.name)
83
+
84
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
85
+ hparams = json.load(f)
86
+
87
+ if hparams["architectures"][0] != "GPTRefactForCausalLM":
88
+ print("Model architecture not supported: " + hparams["architectures"][0])
89
+
90
+ sys.exit(1)
91
+
92
+ # get number of model parts
93
+ num_parts = count_model_parts(dir_model)
94
+
95
+ ARCH = gguf.MODEL_ARCH.REFACT
96
+ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
97
+
98
+ print("gguf: get model metadata")
99
+
100
+ # Get refact feed forward dimension
101
+ hidden_dim = hparams["n_embd"]
102
+ inner_dim = 4 * hidden_dim
103
+ hidden_dim = int(2 * inner_dim / 3)
104
+ multiple_of = 256
105
+ ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
106
+
107
+ block_count = hparams["n_layer"]
108
+
109
+ gguf_writer.add_name("Refact")
110
+ # refact uses Alibi. So this is from config.json which might be used by training.
111
+ gguf_writer.add_context_length(hparams["n_positions"])
112
+ gguf_writer.add_embedding_length(hparams["n_embd"])
113
+
114
+ gguf_writer.add_feed_forward_length(ff_dim)
115
+ gguf_writer.add_block_count(block_count)
116
+ gguf_writer.add_head_count(hparams["n_head"])
117
+ gguf_writer.add_head_count_kv(1)
118
+ gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
119
+ gguf_writer.add_file_type(ftype)
120
+
121
+ # TOKENIZATION
122
+
123
+ print("gguf: get tokenizer metadata")
124
+
125
+ tokens: list[bytearray] = []
126
+ scores: list[float] = []
127
+ toktypes: list[int] = []
128
+
129
+ # gpt2 tokenizer
130
+ gguf_writer.add_tokenizer_model("gpt2")
131
+
132
+ print("gguf: get gpt2 tokenizer vocab")
133
+
134
+ # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
135
+ tokenizer = AutoTokenizer.from_pretrained(dir_model)
136
+
137
+ # The number of tokens in tokenizer.json can differ from the expected vocab size.
138
+ # This causes downstream issues with mismatched tensor sizes when running the inference
139
+ vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
140
+ assert max(tokenizer.vocab.values()) < vocab_size
141
+
142
+ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
143
+
144
+ for i in range(vocab_size):
145
+ tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
146
+ scores.append(0.0) # dummy
147
+ toktypes.append(gguf.TokenType.NORMAL)
148
+
149
+ gguf_writer.add_token_list(tokens)
150
+ gguf_writer.add_token_scores(scores)
151
+ gguf_writer.add_token_types(toktypes)
152
+
153
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
154
+ special_vocab.add_to_gguf(gguf_writer)
155
+
156
+ # TENSORS
157
+
158
+ tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
159
+
160
+ # params for qkv transform
161
+ n_head = hparams["n_head"]
162
+ n_head_kv = 1
163
+
164
+ head_dim = hparams["n_embd"] // n_head
165
+
166
+ # tensor info
167
+ print("gguf: get tensor metadata")
168
+
169
+ if num_parts == 0:
170
+ part_names = iter(("pytorch_model.bin",))
171
+ else:
172
+ part_names = (
173
+ f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
174
+ )
175
+ for part_name in part_names:
176
+ if args.vocab_only:
177
+ break
178
+ print("gguf: loading model part '" + part_name + "'")
179
+ model_part = torch.load(dir_model / part_name, map_location="cpu")
180
+
181
+ for i in range(block_count):
182
+ if f"transformer.h.{i}.attn.kv.weight" in model_part:
183
+ data = model_part[f"transformer.h.{i}.attn.kv.weight"]
184
+ model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
185
+ : n_head_kv * head_dim
186
+ ]
187
+ model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
188
+ n_head_kv * head_dim :
189
+ ]
190
+ del model_part[f"transformer.h.{i}.attn.kv.weight"]
191
+ if f"transformer.h.{i}.attn.q.weight" in model_part:
192
+ model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
193
+ f"transformer.h.{i}.attn.q.weight"
194
+ ]
195
+ del model_part[f"transformer.h.{i}.attn.q.weight"]
196
+ if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
197
+ data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
198
+ model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
199
+ model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
200
+ del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
201
+
202
+ for name in model_part.keys():
203
+ data = model_part[name]
204
+
205
+ old_dtype = data.dtype
206
+
207
+ # convert any unsupported data types to float32
208
+ if data.dtype != torch.float16 and data.dtype != torch.float32:
209
+ data = data.to(torch.float32)
210
+
211
+ data = data.squeeze().numpy()
212
+
213
+ # map tensor names
214
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
215
+ if new_name is None:
216
+ print("Can not map tensor '" + name + "'")
217
+ sys.exit()
218
+
219
+ n_dims = len(data.shape)
220
+ data_dtype = data.dtype
221
+
222
+ # if f32 desired, convert any float16 to float32
223
+ if ftype == 0 and data_dtype == np.float16:
224
+ data = data.astype(np.float32)
225
+
226
+ # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
227
+ if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
228
+ data = data.astype(np.float32)
229
+
230
+ # if f16 desired, convert any float32 2-dim weight tensors to float16
231
+ if (
232
+ ftype == 1
233
+ and data_dtype == np.float32
234
+ and name.endswith(".weight")
235
+ and n_dims == 2
236
+ ):
237
+ data = data.astype(np.float16)
238
+
239
+ print(
240
+ new_name
241
+ + ", n_dims = "
242
+ + str(n_dims)
243
+ + ", "
244
+ + str(old_dtype)
245
+ + " --> "
246
+ + str(data.dtype)
247
+ )
248
+
249
+ gguf_writer.add_tensor(new_name, data)
250
+
251
+
252
+ print("gguf: write header")
253
+ gguf_writer.write_header_to_file()
254
+ print("gguf: write metadata")
255
+ gguf_writer.write_kv_data_to_file()
256
+ if not args.vocab_only:
257
+ print("gguf: write tensors")
258
+ gguf_writer.write_tensors_to_file()
259
+
260
+ gguf_writer.close()
261
+
262
+ print(f"gguf: model successfully exported to '{fname_out}'")
263
+ print("")
examples/CMakeLists.txt CHANGED
@@ -25,6 +25,7 @@ else()
25
  add_subdirectory(convert-llama2c-to-ggml)
26
  add_subdirectory(simple)
27
  add_subdirectory(batched)
 
28
  add_subdirectory(speculative)
29
  add_subdirectory(parallel)
30
  add_subdirectory(embd-input)
 
25
  add_subdirectory(convert-llama2c-to-ggml)
26
  add_subdirectory(simple)
27
  add_subdirectory(batched)
28
+ add_subdirectory(batched-bench)
29
  add_subdirectory(speculative)
30
  add_subdirectory(parallel)
31
  add_subdirectory(embd-input)
examples/batched-bench/CMakeLists.txt ADDED
@@ -0,0 +1,5 @@
1
+ set(TARGET batched-bench)
2
+ add_executable(${TARGET} batched-bench.cpp)
3
+ install(TARGETS ${TARGET} RUNTIME)
4
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
5
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/batched-bench/README.md ADDED
@@ -0,0 +1,51 @@
1
+ # llama.cpp/example/batched-bench
2
+
3
+ Benchmark the batched decoding performance of `llama.cpp`
4
+
5
+ ## Usage
6
+
7
+ There are 2 modes of operation:
8
+
9
+ - `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
10
+ - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
11
+
12
+ ```bash
13
+ ./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
14
+
15
+ # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
16
+ ./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
17
+
18
+ # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
19
+ ./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
20
+
21
+ # custom set of batches
22
+ ./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
23
+ ```
24
+
25
+ ## Sample results
26
+
27
+ - `PP` - prompt tokens per batch
28
+ - `TG` - generated tokens per batch
29
+ - `B` - number of batches
30
+ - `N_KV` - required KV cache size
31
+ - `T_PP` - prompt processing time (i.e. time to first token)
32
+ - `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
33
+ - `T_TG` - time to generate all batches
34
+ - `S_TG` - text generation speed (`(B*TG)/T_TG`)
35
+ - `T` - total time
36
+ - `S` - total speed (i.e. all tokens / total time)
37
+
38
+ | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
39
+ |-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
40
+ | 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 |
41
+ | 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 |
42
+ | 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 |
43
+ | 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 |
44
+ | 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 |
45
+ | 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 |
46
+ | 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 |
47
+ | 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 |
48
+ | 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 |
49
+ | 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
50
+ | 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
51
+ | 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
examples/batched-bench/batched-bench.cpp ADDED
@@ -0,0 +1,251 @@
 
1
+ #include "common.h"
2
+ #include "llama.h"
3
+
4
+ #include <algorithm>
5
+ #include <cmath>
6
+ #include <cstdio>
7
+ #include <string>
8
+ #include <vector>
9
+
10
+ // mutates the input string
11
+ static std::vector<int> parse_list(char * p) {
12
+ std::vector<int> ret;
13
+
14
+ char * q = p;
15
+
16
+ while (*p) {
17
+ if (*p == ',') {
18
+ *p = '\0';
19
+ ret.push_back(std::atoi(q));
20
+ q = p + 1;
21
+ }
22
+
23
+ ++p;
24
+ }
25
+
26
+ ret.push_back(std::atoi(q));
27
+
28
+ return ret;
29
+ }
30
+
31
+ int main(int argc, char ** argv) {
32
+ gpt_params params;
33
+
34
+ if (argc == 1 || argv[1][0] == '-') {
35
+ printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
36
+ printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
37
+ printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
38
+ return 1 ;
39
+ }
40
+
41
+ int n_kv_max = 2048;
42
+ int is_pp_shared = 0;
43
+ int n_gpu_layers = 0;
44
+ int mmq = 0;
45
+
46
+ std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
47
+ std::vector<int> n_tg = { 128, 256, };
48
+ std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
49
+ //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
50
+
51
+ if (argc >= 2) {
52
+ params.model = argv[1];
53
+ }
54
+
55
+ if (argc >= 3) {
56
+ n_kv_max = std::atoi(argv[2]);
57
+ }
58
+
59
+ if (argc >= 4) {
60
+ is_pp_shared = std::atoi(argv[3]);
61
+ }
62
+
63
+ if (argc >= 5) {
64
+ n_gpu_layers = std::atoi(argv[4]);
65
+ }
66
+
67
+ if (argc >= 6) {
68
+ mmq = std::atoi(argv[5]);
69
+ }
70
+
71
+ if (argc >= 7) {
72
+ n_pp = parse_list(argv[6]);
73
+ }
74
+
75
+ if (argc >= 8) {
76
+ n_tg = parse_list(argv[7]);
77
+ }
78
+
79
+ if (argc >= 9) {
80
+ n_pl = parse_list(argv[8]);
81
+ }
82
+
83
+ // init LLM
84
+
85
+ llama_backend_init(params.numa);
86
+
87
+ // initialize the model
88
+
89
+ llama_model_params model_params = llama_model_default_params();
90
+
91
+ model_params.n_gpu_layers = n_gpu_layers;
92
+
93
+ llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
94
+
95
+ if (model == NULL) {
96
+ fprintf(stderr , "%s: error: unable to load model\n" , __func__);
97
+ return 1;
98
+ }
99
+
100
+ llama_context_params ctx_params = llama_context_default_params();
101
+
102
+ ctx_params.seed = 1234;
103
+ ctx_params.n_ctx = n_kv_max;
104
+ ctx_params.n_batch = 512;
105
+ ctx_params.mul_mat_q = mmq;
106
+
107
+ ctx_params.n_threads = params.n_threads;
108
+ ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
109
+
110
+ llama_context * ctx = llama_new_context_with_model(model, ctx_params);
111
+
112
+ if (ctx == NULL) {
113
+ fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
114
+ return 1;
115
+ }
116
+
117
+ llama_batch batch = llama_batch_init(n_kv_max, 0);
118
+
119
+ // decode in batches of ctx_params.n_batch tokens
120
+ auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
121
+ for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
122
+ const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
123
+
124
+ llama_batch batch_view = {
125
+ n_tokens,
126
+ batch.token + i,
127
+ nullptr,
128
+ batch.pos + i,
129
+ batch.seq_id + i,
130
+ batch.logits + i,
131
+ 0, 0, 0, // unused
132
+ };
133
+
134
+ const int ret = llama_decode(ctx, batch_view);
135
+ if (ret != 0) {
136
+ LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
137
+ return false;
138
+ }
139
+ }
140
+
141
+ return true;
142
+ };
143
+
144
+ // warm up
145
+ {
146
+ batch.n_tokens = 16;
147
+
148
+ for (int i = 0; i < batch.n_tokens; ++i) {
149
+ batch.token[i] = 0;
150
+ batch.pos[i] = i;
151
+ batch.seq_id[i] = 0;
152
+ batch.logits[i] = false;
153
+ }
154
+
155
+ if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
156
+ LOG_TEE("%s: llama_decode() failed\n", __func__);
157
+ return 1;
158
+ }
159
+ }
160
+
161
+ LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
162
+ LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
163
+
164
+ for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
165
+ for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
166
+ for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
167
+ const int pp = n_pp[i_pp];
168
+ const int tg = n_tg[i_tg];
169
+ const int pl = n_pl[i_pl];
170
+
171
+ const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
172
+
173
+ if (n_ctx_req > n_kv_max) {
174
+ continue;
175
+ }
176
+
177
+ batch.n_tokens = is_pp_shared ? pp : pl*pp;
178
+
179
+ for (int i = 0; i < batch.n_tokens; ++i) {
180
+ batch.token[i] = 0;
181
+ batch.pos[i] = i;
182
+ batch.seq_id[i] = 0;
183
+ batch.logits[i] = false;
184
+ }
185
+ batch.logits[batch.n_tokens - 1] = true;
186
+
187
+ const auto t_pp_start = ggml_time_us();
188
+
189
+ llama_kv_cache_tokens_rm(ctx, -1, -1);
190
+
191
+ if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
192
+ LOG_TEE("%s: llama_decode() failed\n", __func__);
193
+ return 1;
194
+ }
195
+
196
+ if (is_pp_shared) {
197
+ for (int32_t i = 1; i < pl; ++i) {
198
+ llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
199
+ }
200
+ }
201
+
202
+ const auto t_pp_end = ggml_time_us();
203
+
204
+ const auto t_tg_start = ggml_time_us();
205
+
206
+ for (int i = 0; i < tg; ++i) {
207
+ batch.n_tokens = pl;
208
+
209
+ for (int j = 0; j < pl; ++j) {
210
+ batch.token[j] = 0;
211
+ batch.pos[j] = pp + i;
212
+ batch.seq_id[j] = j;
213
+ batch.logits[j] = true;
214
+ }
215
+
216
+ if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
217
+ LOG_TEE("%s: llama_decode() failed\n", __func__);
218
+ return 1;
219
+ }
220
+ }
221
+
222
+ const auto t_tg_end = ggml_time_us();
223
+
224
+ const int32_t n_kv = n_ctx_req;
225
+
226
+ const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
227
+ const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
228
+ const float t = t_pp + t_tg;
229
+
230
+ const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
231
+ const float speed_tg = pl*tg / t_tg;
232
+ const float speed = n_kv / t;
233
+
234
+ LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
235
+ }
236
+ }
237
+ }
238
+
239
+ llama_print_timings(ctx);
240
+
241
+ llama_batch_free(batch);
242
+
243
+ llama_free(ctx);
244
+ llama_free_model(model);
245
+
246
+ llama_backend_free();
247
+
248
+ fprintf(stderr, "\n\n");
249
+
250
+ return 0;
251
+ }
examples/batched.swift/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ .DS_Store
2
+ /.build
3
+ /Packages
4
+ xcuserdata/
5
+ DerivedData/
6
+ .swiftpm/configuration/registries.json
7
+ .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
8
+ .netrc
9
+ batched_swift
examples/batched.swift/Makefile ADDED
@@ -0,0 +1,6 @@
1
+ .PHONY: build
2
+
3
+ build:
4
+ xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
5
+ rm -f ./batched_swift
6
+ ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
examples/batched.swift/Package.swift ADDED
@@ -0,0 +1,22 @@
1
+ // swift-tools-version: 5.5
2
+ // The swift-tools-version declares the minimum version of Swift required to build this package.
3
+
4
+ import PackageDescription
5
+
6
+ let package = Package(
7
+ name: "batched_swift",
8
+ platforms: [.macOS(.v12)],
9
+ dependencies: [
10
+ .package(name: "llama", path: "../../"),
11
+ ],
12
+ targets: [
13
+ // Targets are the basic building blocks of a package, defining a module or a test suite.
14
+ // Targets can depend on other targets in this package and products from dependencies.
15
+ .executableTarget(
16
+ name: "batched_swift",
17
+ dependencies: ["llama"],
18
+ path: "Sources",
19
+ linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
20
+ ),
21
+ ]
22
+ )
examples/batched.swift/README.md ADDED
@@ -0,0 +1,4 @@
1
+ This is a swift clone of `examples/batched`.
2
+
3
+ $ `make`
4
+ $ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
examples/batched.swift/Sources/main.swift ADDED
@@ -0,0 +1,255 @@
1
+ import Foundation
2
+ import llama
3
+
4
+ let arguments = CommandLine.arguments
5
+
6
+ // Check that we have at least one argument (the model path)
7
+ guard arguments.count > 1 else {
8
+ print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
9
+ exit(1)
10
+ }
11
+
12
+ let modelPath: String = arguments[1]
13
+ let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
14
+ let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1
15
+
16
+ // total length of the sequences including the prompt
17
+ let n_len: Int = 32
18
+
19
+ // init LLM
20
+ llama_backend_init(false)
21
+ defer {
22
+ llama_backend_free()
23
+ }
24
+
25
+ let model_params = llama_model_default_params()
26
+ guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
27
+ print("Failed to load model")
28
+ exit(1)
29
+ }
30
+
31
+ defer {
32
+ llama_free_model(model)
33
+ }
34
+
35
+ var tokens = tokenize(text: prompt, add_bos: true)
36
+
37
+ let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
38
+
39
+ var context_params = llama_context_default_params()
40
+ context_params.seed = 1234
41
+ context_params.n_ctx = n_kv_req
42
+ context_params.n_batch = UInt32(max(n_len, n_parallel))
43
+ context_params.n_threads = 8
44
+ context_params.n_threads_batch = 8
45
+
46
+ let context = llama_new_context_with_model(model, context_params)
47
+ guard context != nil else {
48
+ print("Failed to initialize context")
49
+ exit(1)
50
+ }
51
+
52
+ defer {
53
+ llama_free(context)
54
+ }
55
+
56
+ let n_ctx = llama_n_ctx(context)
57
+
58
+ print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
59
+
60
+ if n_kv_req > n_ctx {
61
+ print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
62
+ exit(1)
63
+ }
64
+
65
+ var buffer: [CChar] = []
66
+ for id: llama_token in tokens {
67
+ print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
68
+ }
69
+
70
+ print("\n")
71
+
72
+ var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0)
73
+ defer {
74
+ llama_batch_free(batch)
75
+ }
76
+
77
+ // evaluate the initial prompt
78
+ batch.n_tokens = Int32(tokens.count)
79
+
80
+ for (i, token) in tokens.enumerated() {
81
+ batch.token[i] = token
82
+ batch.pos[i] = Int32(i)
83
+ batch.seq_id[i] = 0
84
+ batch.logits[i] = 0
85
+ }
86
+
87
+ // llama_decode will output logits only for the last token of the prompt
88
+ batch.logits[Int(batch.n_tokens) - 1] = 1
89
+
90
+ if llama_decode(context, batch) != 0 {
91
+ print("llama_decode() failed")
92
+ exit(1)
93
+ }
94
+
95
+ for i in 1 ..< n_parallel {
96
+ llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
97
+ }
98
+
99
+ if n_parallel > 1 {
100
+ print("generating \(n_parallel) sequences ...\n")
101
+ }
102
+
103
+ var streams: [String] = .init(repeating: "", count: n_parallel)
104
+ var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
105
+ var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
106
+
107
+ var n_cur = batch.n_tokens
108
+ var n_decode = 0
109
+
110
+ let t_main_start = ggml_time_us()
111
+
112
+ while n_cur <= n_len {
113
+ // prepare the next batch
114
+ batch.n_tokens = 0
115
+
116
+ // sample the next token for each parallel sequence / stream
117
+ for i in 0 ..< n_parallel {
118
+ if i_batch[i] < 0 {
119
+ // the stream has already finished
120
+ continue
121
+ }
122
+
123
+ var n_vocab = llama_n_vocab(model)
124
+ var logits = llama_get_logits_ith(context, i_batch[i])
125
+
126
+ var candidates: [llama_token_data] = []
+ candidates.reserveCapacity(Int(n_vocab)) // the loop below appends exactly n_vocab entries
127
+
128
+ for token_id in 0 ..< n_vocab {
129
+ candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
130
+ }
131
+
132
+ var candidates_p: llama_token_data_array = .init(
133
+ data: &candidates,
134
+ size: candidates.count,
135
+ sorted: false
136
+ )
137
+
138
+ let top_k: Int32 = 40
139
+ let top_p: Float = 0.9
140
+ let temp: Float = 0.4
141
+
142
+ llama_sample_top_k(context, &candidates_p, top_k, 1)
143
+ llama_sample_top_p(context, &candidates_p, top_p, 1)
144
+ llama_sample_temp(context, &candidates_p, temp)
145
+
146
+ let new_token_id = llama_sample_token(context, &candidates_p)
147
+
148
+ // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
149
+
150
+ // is it an end of stream? -> mark the stream as finished
151
+ if new_token_id == llama_token_eos(context) || n_cur == n_len {
152
+ i_batch[i] = -1
153
+ // print("")
154
+ if n_parallel > 1 {
155
+ print("stream \(i) finished at n_cur = \(n_cur)")
156
+ }
157
+
158
+ continue
159
+ }
160
+
161
+ let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
162
+
163
+ // if there is only one stream, we print immediately to stdout
164
+ if n_parallel == 1 {
165
+ print(nextStringPiece, terminator: "")
166
+ }
167
+ streams[i] += nextStringPiece
168
+
169
+ // push this new token for next evaluation
170
+ batch.token[Int(batch.n_tokens)] = new_token_id
171
+ batch.pos[Int(batch.n_tokens)] = n_cur
172
+ batch.seq_id[Int(batch.n_tokens)] = Int32(i)
173
+ batch.logits[Int(batch.n_tokens)] = 1
174
+
175
+ i_batch[i] = batch.n_tokens
176
+
177
+ batch.n_tokens += 1
178
+
179
+ n_decode += 1
180
+ }
181
+
182
+ // all streams are finished
183
+ if batch.n_tokens == 0 {
184
+ break
185
+ }
186
+
187
+ n_cur += 1
188
+
189
+ // evaluate the current batch with the transformer model
190
+ if llama_decode(context, batch) != 0 {
191
+ print("llama_decode() failed")
192
+ exit(1)
193
+ }
194
+ }
195
+
196
+ if n_parallel > 1 {
197
+ print("\n")
198
+ for (i, stream) in streams.enumerated() {
199
+ print("sequence \(i):\n\n\(prompt)\(stream)\n")
200
+ }
201
+ }
202
+
203
+ let t_main_end = ggml_time_us()
204
+
205
+ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
206
+
207
+ llama_print_timings(context)
208
+
209
+ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
210
+ let n_tokens = text.count + (add_bos ? 1 : 0)
211
+ let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
212
+ let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
213
+ var swiftTokens: [llama_token] = []
214
+ for i in 0 ..< tokenCount {
215
+ swiftTokens.append(tokens[Int(i)])
216
+ }
217
+ tokens.deallocate()
218
+ return swiftTokens
219
+ }
220
+
221
+ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
222
+ var result = [CChar](repeating: 0, count: 8)
223
+ let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
224
+ if nTokens < 0 {
225
+ if result.count >= -Int(nTokens) {
226
+ result.removeLast(-Int(nTokens))
227
+ } else {
228
+ result.removeAll()
229
+ }
230
+ let check = llama_token_to_piece(
231
+ model,
232
+ token,
233
+ &result,
234
+ Int32(result.count)
235
+ )
236
+ assert(check == nTokens)
237
+ } else {
238
+ result.removeLast(result.count - Int(nTokens))
239
+ }
240
+ if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
241
+ return utfString
242
+ } else {
243
+ buffer.append(contentsOf: result)
244
+ let data = Data(buffer.map { UInt8(bitPattern: $0) })
245
+ if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
246
+ buffer = []
247
+ }
248
+ guard let bufferString = String(data: data, encoding: .utf8) else {
249
+ return nil
250
+ }
251
+ buffer = []
252
+ return bufferString
253
+ }
254
+ return nil
255
+ }
examples/batched/batched.cpp CHANGED
@@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
66
  ctx_params.seed = 1234;
67
  ctx_params.n_ctx = n_kv_req;
68
  ctx_params.n_batch = std::max(n_len, n_parallel);
69
- ctx_params.n_threads = params.n_threads;
70
  ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
71
 
72
  llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
66
  ctx_params.seed = 1234;
67
  ctx_params.n_ctx = n_kv_req;
68
  ctx_params.n_batch = std::max(n_len, n_parallel);
69
+ ctx_params.n_threads = params.n_threads;
70
  ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
71
 
72
  llama_context * ctx = llama_new_context_with_model(model, ctx_params);
examples/embd-input/embd-input-lib.cpp CHANGED
@@ -128,21 +128,22 @@ bool eval_string(struct MyModel * mymodel,const char* str){
128
  llama_token sampling_id(struct MyModel* mymodel) {
129
  llama_context* ctx = mymodel->ctx;
130
  gpt_params params = mymodel->params;
 
131
  // int n_ctx = llama_n_ctx(ctx);
132
 
133
  // out of user input, sample next token
134
- const float temp = params.temp;
135
- const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
136
- const float top_p = params.top_p;
137
- const float tfs_z = params.tfs_z;
138
- const float typical_p = params.typical_p;
139
  // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
140
  // const float repeat_penalty = params.repeat_penalty;
141
  // const float alpha_presence = params.presence_penalty;
142
  // const float alpha_frequency = params.frequency_penalty;
143
- const int mirostat = params.mirostat;
144
- const float mirostat_tau = params.mirostat_tau;
145
- const float mirostat_eta = params.mirostat_eta;
146
  // const bool penalize_nl = params.penalize_nl;
147
 
148
  llama_token id = 0;
@@ -151,7 +152,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
151
  auto n_vocab = llama_n_vocab(llama_get_model(ctx));
152
 
153
  // Apply params.logit_bias map
154
- for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
155
  logits[it->first] += it->second;
156
  }
157
 
 
128
  llama_token sampling_id(struct MyModel* mymodel) {
129
  llama_context* ctx = mymodel->ctx;
130
  gpt_params params = mymodel->params;
131
+ llama_sampling_params & sparams = params.sampling_params;
132
  // int n_ctx = llama_n_ctx(ctx);
133
 
134
  // out of user input, sample next token
135
+ const float temp = sparams.temp;
136
+ const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
137
+ const float top_p = sparams.top_p;
138
+ const float tfs_z = sparams.tfs_z;
139
+ const float typical_p = sparams.typical_p;
140
  // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
141
  // const float repeat_penalty = params.repeat_penalty;
142
  // const float alpha_presence = params.presence_penalty;
143
  // const float alpha_frequency = params.frequency_penalty;
144
+ const int mirostat = sparams.mirostat;
145
+ const float mirostat_tau = sparams.mirostat_tau;
146
+ const float mirostat_eta = sparams.mirostat_eta;
147
  // const bool penalize_nl = params.penalize_nl;
148
 
149
  llama_token id = 0;
 
152
  auto n_vocab = llama_n_vocab(llama_get_model(ctx));
153
 
154
  // Apply params.logit_bias map
155
+ for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
156
  logits[it->first] += it->second;
157
  }
158
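
The change above is representative of a wider refactor in this commit: the sampler settings (temperature, top-k/top-p, tail-free, typical, mirostat, logit biases) now live in a nested llama_sampling_params struct inside gpt_params, provided by the new common/sampling.{h,cpp}. Call sites take a reference once and read the same fields through it. A small illustrative sketch follows; the function name is made up and it assumes only the fields that appear in this diff.

    #include <cstdio>
    #include "common.h"   // gpt_params, which now carries params.sampling_params

    // Print a few of the relocated sampler settings.
    static void print_sampler_settings(const gpt_params & params) {
        const llama_sampling_params & sparams = params.sampling_params;
        printf("temp = %.2f, top_k = %d, top_p = %.2f, tfs_z = %.2f, mirostat = %d\n",
               sparams.temp, sparams.top_k, sparams.top_p, sparams.tfs_z, sparams.mirostat);
    }
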
 
examples/infill/infill.cpp ADDED
@@ -0,0 +1,800 @@
1
+ #include "common.h"
2
+
3
+ #include "console.h"
4
+ #include "llama.h"
5
+ #include "build-info.h"
6
+ #include "grammar-parser.h"
7
+
8
+ #include <cassert>
9
+ #include <cinttypes>
10
+ #include <cmath>
11
+ #include <cstdio>
12
+ #include <cstring>
13
+ #include <ctime>
14
+ #include <fstream>
15
+ #include <iostream>
16
+ #include <sstream>
17
+ #include <string>
18
+ #include <vector>
19
+
20
+ #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
21
+ #include <signal.h>
22
+ #include <unistd.h>
23
+ #elif defined (_WIN32)
24
+ #define WIN32_LEAN_AND_MEAN
25
+ #ifndef NOMINMAX
26
+ #define NOMINMAX
27
+ #endif
28
+ #include <windows.h>
29
+ #include <signal.h>
30
+ #endif
31
+
32
+ #if defined(_MSC_VER)
33
+ #pragma warning(disable: 4244 4267) // possible loss of data
34
+ #endif
35
+
36
+ static llama_context ** g_ctx;
37
+ static llama_model ** g_model;
38
+ static gpt_params * g_params;
39
+ static std::vector<llama_token> * g_input_tokens;
40
+ static std::ostringstream * g_output_ss;
41
+ static std::vector<llama_token> * g_output_tokens;
42
+ static bool is_interacting = false;
43
+
44
+
45
+ static void write_logfile(
46
+ const llama_context * ctx, const gpt_params & params, const llama_model * model,
47
+ const std::vector<llama_token> & input_tokens, const std::string & output,
48
+ const std::vector<llama_token> & output_tokens
49
+ ) {
50
+ if (params.logdir.empty()) {
51
+ return;
52
+ }
53
+
54
+ const std::string timestamp = get_sortable_timestamp();
55
+
56
+ const bool success = create_directory_with_parents(params.logdir);
57
+ if (!success) {
58
+ fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
59
+ __func__, params.logdir.c_str());
60
+ return;
61
+ }
62
+
63
+ const std::string logfile_path = params.logdir + timestamp + ".yml";
64
+ FILE * logfile = fopen(logfile_path.c_str(), "w");
65
+
66
+ if (logfile == NULL) {
67
+ fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
68
+ return;
69
+ }
70
+
71
+ fprintf(logfile, "binary: infill\n");
72
+ char model_desc[128];
73
+ llama_model_desc(model, model_desc, sizeof(model_desc));
74
+ dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
75
+
76
+ fprintf(logfile, "\n");
77
+ fprintf(logfile, "######################\n");
78
+ fprintf(logfile, "# Generation Results #\n");
79
+ fprintf(logfile, "######################\n");
80
+ fprintf(logfile, "\n");
81
+
82
+ dump_string_yaml_multiline(logfile, "output", output.c_str());
83
+ dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
84
+
85
+ llama_dump_timing_info_yaml(logfile, ctx);
86
+ fclose(logfile);
87
+ }
88
+
89
+ #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
90
+ static void sigint_handler(int signo) {
91
+ if (signo == SIGINT) {
92
+ if (!is_interacting) {
93
+ is_interacting = true;
94
+ } else {
95
+ console::cleanup();
96
+ printf("\n");
97
+ llama_print_timings(*g_ctx);
98
+ write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
99
+ _exit(130);
100
+ }
101
+ }
102
+ }
103
+ #endif
104
+
105
+ int main(int argc, char ** argv) {
106
+ gpt_params params;
107
+ llama_sampling_params & sparams = params.sampling_params;
108
+ g_params = &params;
109
+
110
+ if (!gpt_params_parse(argc, argv, params)) {
111
+ return 1;
112
+ }
113
+
114
+ #ifndef LOG_DISABLE_LOGS
115
+ log_set_target(log_filename_generator("infill", "log"));
116
+ LOG_TEE("Log start\n");
117
+ log_dump_cmdline(argc, argv);
118
+ #endif // LOG_DISABLE_LOGS
119
+
120
+ console::init(params.simple_io, params.use_color);
121
+ atexit([]() { console::cleanup(); });
122
+
123
+ if (params.logits_all) {
124
+ printf("\n************\n");
125
+ printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
126
+ printf("************\n\n");
127
+
128
+ return 0;
129
+ }
130
+
131
+ if (params.embedding) {
132
+ printf("\n************\n");
133
+ printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
134
+ printf("************\n\n");
135
+
136
+ return 0;
137
+ }
138
+
139
+ if (params.n_ctx != 0 && params.n_ctx < 8) {
140
+ LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
141
+ params.n_ctx = 8;
142
+ }
143
+ if (params.instruct) {
144
+ printf("\n************\n");
145
+ printf("%s: please use the 'main' tool for instruct mode\n", __func__);
146
+ printf("************\n\n");
147
+
148
+ return 0;
149
+ }
150
+ if (!params.antiprompt.empty()) {
151
+ printf("\n************\n");
152
+ printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
153
+ printf("************\n\n");
154
+
155
+ return 0;
156
+ }
157
+ if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
158
+ printf("\n************\n");
159
+ printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
160
+ printf("************\n\n");
161
+
162
+ return 0;
163
+ }
164
+ if (params.random_prompt) {
165
+ printf("\n************\n");
166
+ printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
167
+ printf("************\n\n");
168
+
169
+ return 0;
170
+ }
171
+ if (!params.path_prompt_cache.empty()) {
172
+ printf("\n************\n");
173
+ printf("%s: infill does not support prompt caching\n", __func__);
174
+ printf("************\n\n");
175
+
176
+ return 0;
177
+ }
178
+
179
+ if (params.rope_freq_base != 0.0) {
180
+ LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
181
+ }
182
+
183
+ if (params.rope_freq_scale != 0.0) {
184
+ LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
185
+ }
186
+
187
+ LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
188
+ LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
189
+
190
+ if (params.seed == LLAMA_DEFAULT_SEED) {
191
+ params.seed = time(NULL);
192
+ }
193
+
194
+ LOG_TEE("%s: seed = %u\n", __func__, params.seed);
195
+
196
+ std::mt19937 rng(params.seed);
197
+
198
+ LOG("%s: llama backend init\n", __func__);
199
+ llama_backend_init(params.numa);
200
+
201
+ llama_model * model;
202
+ llama_context * ctx;
203
+ llama_context * ctx_guidance = NULL;
204
+ g_model = &model;
205
+ g_ctx = &ctx;
206
+
207
+ // load the model and apply lora adapter, if any
208
+ LOG("%s: load the model and apply lora adapter, if any\n", __func__);
209
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
210
+ if (sparams.cfg_scale > 1.f) {
211
+ struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
212
+ ctx_guidance = llama_new_context_with_model(model, lparams);
213
+ }
214
+
215
+ if (model == NULL) {
216
+ LOG_TEE("%s: error: unable to load model\n", __func__);
217
+ return 1;
218
+ }
219
+
220
+ const int n_ctx_train = llama_n_ctx_train(model);
221
+ const int n_ctx = llama_n_ctx(ctx);
222
+ LOG("n_ctx: %d\n", n_ctx);
223
+
224
+ if (n_ctx > n_ctx_train) {
225
+ LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
226
+ __func__, n_ctx_train, n_ctx);
227
+ }
228
+
229
+ // print system information
230
+ {
231
+ LOG_TEE("\n");
232
+ LOG_TEE("%s\n", get_system_info(params).c_str());
233
+ }
234
+ const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
235
+ LOG("add_bos: %d\n", add_bos);
236
+
237
+ bool suff_rm_leading_spc = params.escape;
238
+ if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
239
+ params.input_suffix.erase(0, 1);
240
+ suff_rm_leading_spc = false;
241
+ }
242
+ std::vector<llama_token> embd_inp;
243
+ std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
244
+ std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
245
+ const int space_token = 29871;
246
+ if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
247
+ inp_sfx.erase(inp_sfx.begin());
248
+ }
249
+ inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
250
+ if (add_bos) {
251
+ inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
252
+ }
253
+ inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
254
+ embd_inp = inp_pfx;
255
+ embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
256
+ embd_inp.push_back(llama_token_middle(ctx));
257
+
258
+ LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
259
+ LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
260
+ LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
261
+
262
+ // Should not run without any tokens
263
+ if (embd_inp.empty()) {
264
+ embd_inp.push_back(llama_token_bos(ctx));
265
+ LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
266
+ }
267
+
268
+ // Tokenize negative prompt
269
+ std::vector<llama_token> guidance_inp;
270
+ int guidance_offset = 0;
271
+ int original_prompt_len = 0;
272
+ if (ctx_guidance) {
273
+ LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
274
+
275
+ guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
276
+ LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
277
+
278
+ std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
279
+ LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
280
+
281
+ original_prompt_len = original_inp.size();
282
+ guidance_offset = (int)guidance_inp.size() - original_prompt_len;
283
+ LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
284
+ LOG("guidance_offset: %s", log_tostr(guidance_offset));
285
+ }
286
+
287
+ if ((int) embd_inp.size() > n_ctx - 4) {
288
+ LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
289
+ return 1;
290
+ }
291
+
292
+ // number of tokens to keep when resetting context
293
+ if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
294
+ params.n_keep = (int)embd_inp.size();
295
+ }
296
+
297
+ LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
298
+ LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
299
+
300
+
301
+ // enable interactive mode if interactive start is specified
302
+ if (params.interactive_first) {
303
+ params.interactive = true;
304
+ }
305
+
306
+ if (params.verbose_prompt) {
307
+ LOG_TEE("\n");
308
+ LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
309
+ LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
310
+ for (int i = 0; i < (int) embd_inp.size(); i++) {
311
+ LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
312
+ }
313
+
314
+ if (ctx_guidance) {
315
+ LOG_TEE("\n");
316
+ LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
317
+ LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
318
+ for (int i = 0; i < (int) guidance_inp.size(); i++) {
319
+ LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
320
+ }
321
+ }
322
+
323
+ if (params.n_keep > 0) {
324
+ LOG_TEE("%s: static prompt based on n_keep: '", __func__);
325
+ for (int i = 0; i < params.n_keep; i++) {
326
+ LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
327
+ }
328
+ LOG_TEE("'\n");
329
+ }
330
+ LOG_TEE("\n");
331
+ }
332
+
333
+ if (params.interactive) {
334
+ #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
335
+ struct sigaction sigint_action;
336
+ sigint_action.sa_handler = sigint_handler;
337
+ sigemptyset (&sigint_action.sa_mask);
338
+ sigint_action.sa_flags = 0;
339
+ sigaction(SIGINT, &sigint_action, NULL);
340
+ #elif defined (_WIN32)
341
+ auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
342
+ return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
343
+ };
344
+ SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
345
+ #endif
346
+
347
+ LOG_TEE("%s: interactive mode on.\n", __func__);
348
+
349
+ if (params.input_prefix_bos) {
350
+ LOG_TEE("Input prefix with BOS\n");
351
+ }
352
+
353
+ if (!params.input_prefix.empty()) {
354
+ LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
355
+ }
356
+
357
+ if (!params.input_suffix.empty()) {
358
+ LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
359
+ }
360
+ }
361
+ LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
362
+ sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
363
+ LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
364
+ LOG_TEE("\n\n");
365
+
366
+ struct llama_grammar * grammar = NULL;
367
+ grammar_parser::parse_state parsed_grammar;
368
+
369
+ if (!params.grammar.empty()) {
370
+ parsed_grammar = grammar_parser::parse(params.grammar.c_str());
371
+ // will be empty (default) if there are parse errors
372
+ if (parsed_grammar.rules.empty()) {
373
+ return 1;
374
+ }
375
+ LOG_TEE("%s: grammar:\n", __func__);
376
+ grammar_parser::print_grammar(stderr, parsed_grammar);
377
+ LOG_TEE("\n");
378
+
379
+ {
380
+ auto it = sparams.logit_bias.find(llama_token_eos(ctx));
381
+ if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
382
+ LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
383
+ }
384
+ }
385
+
386
+ std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
387
+ grammar = llama_grammar_init(
388
+ grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
389
+ }
390
+
391
+ // TODO: replace with ring-buffer
392
+ std::vector<llama_token> last_tokens(n_ctx);
393
+ std::fill(last_tokens.begin(), last_tokens.end(), 0);
394
+ LOG_TEE("\n##### Infill mode #####\n\n");
395
+ if (params.infill) {
396
+ printf("\n************\n");
397
+ printf("no need to specify '--infill', always running infill\n");
398
+ printf("************\n\n");
399
+ }
400
+ if (params.interactive) {
401
+ const char *control_message;
402
+ if (params.multiline_input) {
403
+ control_message = " - To return control to LLaMa, end your input with '\\'.\n"
404
+ " - To return control without starting a new line, end your input with '/'.\n";
405
+ } else {
406
+ control_message = " - Press Return to return control to LLaMa.\n"
407
+ " - To return control without starting a new line, end your input with '/'.\n"
408
+ " - If you want to submit another line, end your input with '\\'.\n";
409
+ }
410
+ LOG_TEE("== Running in interactive mode. ==\n");
411
+ #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
412
+ LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
413
+ #endif
414
+ LOG_TEE( "%s\n", control_message);
415
+
416
+ is_interacting = params.interactive_first;
417
+ }
418
+
419
+ bool input_echo = true;
420
+
421
+ int n_past = 0;
422
+ int n_remain = params.n_predict;
423
+ int n_consumed = 0;
424
+ int n_past_guidance = 0;
425
+
426
+ std::vector<int> input_tokens; g_input_tokens = &input_tokens;
427
+ std::vector<int> output_tokens; g_output_tokens = &output_tokens;
428
+ std::ostringstream output_ss; g_output_ss = &output_ss;
429
+
430
+ // the first thing we will do is to output the prompt, so set color accordingly
431
+ console::set_display(console::prompt);
432
+
433
+ std::vector<llama_token> embd;
434
+ std::vector<llama_token> embd_guidance;
435
+
436
+ const int n_vocab = llama_n_vocab(model);
437
+
438
+ llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
439
+ std::vector<llama_token_data> candidates;
440
+ candidates.reserve(n_vocab);
441
+
442
+ while (n_remain != 0 || params.interactive) {
443
+ // predict
444
+ if (!embd.empty()) {
445
+ // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
446
+ // --prompt or --file which uses the same value.
447
+ int max_embd_size = n_ctx - 4;
448
+
449
+ // Ensure the input doesn't exceed the context size by truncating embd if necessary.
450
+ if ((int) embd.size() > max_embd_size) {
451
+ const int skipped_tokens = (int) embd.size() - max_embd_size;
452
+ embd.resize(max_embd_size);
453
+
454
+ console::set_display(console::error);
455
+ printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
456
+ console::set_display(console::reset);
457
+ fflush(stdout);
458
+ }
459
+
460
+ // infinite text generation via context swapping
461
+ // if we run out of context:
462
+ // - take the n_keep first tokens from the original prompt (via n_past)
463
+ // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
464
+ if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
465
+ if (params.n_predict == -2) {
466
+ LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
467
+ break;
468
+ }
469
+
470
+ const int n_left = n_past - params.n_keep - 1;
471
+ const int n_discard = n_left/2;
472
+
473
+ LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
474
+ n_past, n_left, n_ctx, params.n_keep, n_discard);
475
+
476
+ llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
477
+ llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
478
+
479
+ n_past -= n_discard;
480
+
481
+ if (ctx_guidance) {
482
+ n_past_guidance -= n_discard;
483
+ }
484
+
485
+ LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
486
+
487
+ LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
488
+
489
+ }
490
+
491
+ // evaluate tokens in batches
492
+ // embd is typically prepared beforehand to fit within a batch, but not always
493
+
494
+ if (ctx_guidance) {
495
+ int input_size = 0;
496
+ llama_token * input_buf = NULL;
497
+
498
+ if (n_past_guidance < (int) guidance_inp.size()) {
499
+ // Guidance context should have the same data with these modifications:
500
+ //
501
+ // * Replace the initial prompt
502
+ // * Shift everything by guidance_offset
503
+ embd_guidance = guidance_inp;
504
+ if (embd.begin() + original_prompt_len < embd.end()) {
505
+ embd_guidance.insert(
506
+ embd_guidance.end(),
507
+ embd.begin() + original_prompt_len,
508
+ embd.end()
509
+ );
510
+ }
511
+
512
+ input_buf = embd_guidance.data();
513
+ input_size = embd_guidance.size();
514
+
515
+ LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
516
+ } else {
517
+ input_buf = embd.data();
518
+ input_size = embd.size();
519
+ }
520
+
521
+ for (int i = 0; i < input_size; i += params.n_batch) {
522
+ int n_eval = std::min(input_size - i, params.n_batch);
523
+ if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
524
+ LOG_TEE("%s : failed to eval\n", __func__);
525
+ return 1;
526
+ }
527
+
528
+ n_past_guidance += n_eval;
529
+ }
530
+ }
531
+
532
+ for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
533
+ int n_eval = (int) embd.size() - i;
534
+ if (n_eval > params.n_batch) {
535
+ n_eval = params.n_batch;
536
+ }
537
+
538
+ LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
539
+
540
+ if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
541
+ LOG_TEE("%s : failed to eval\n", __func__);
542
+ return 1;
543
+ }
544
+
545
+ n_past += n_eval;
546
+
547
+ LOG("n_past = %d\n", n_past);
548
+ }
549
+
550
+ }
551
+
552
+ embd.clear();
553
+ embd_guidance.clear();
554
+
555
+ if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
556
+
557
+ const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
558
+
559
+ last_tokens.erase(last_tokens.begin());
560
+ last_tokens.push_back(id);
561
+
562
+ LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
563
+
564
+ embd.push_back(id);
565
+
566
+ // echo this to console
567
+ input_echo = true;
568
+
569
+ // decrement remaining sampling budget
570
+ --n_remain;
571
+
572
+ LOG("n_remain: %d\n", n_remain);
573
+ } else {
574
+ // some user input remains from prompt or interaction, forward it to processing
575
+ LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
576
+ while ((int) embd_inp.size() > n_consumed) {
577
+ embd.push_back(embd_inp[n_consumed]);
578
+ last_tokens.erase(last_tokens.begin());
579
+ last_tokens.push_back(embd_inp[n_consumed]);
580
+ ++n_consumed;
581
+ if ((int) embd.size() >= params.n_batch) {
582
+ break;
583
+ }
584
+ }
585
+ }
586
+
587
+ // display text
588
+ if (input_echo) {
589
+ for (auto id : embd) {
590
+ const std::string token_str = llama_token_to_piece(ctx, id);
591
+ printf("%s", token_str.c_str());
592
+
593
+ if (embd.size() > 1) {
594
+ input_tokens.push_back(id);
595
+ } else {
596
+ output_tokens.push_back(id);
597
+ output_ss << token_str;
598
+ }
599
+ }
600
+ fflush(stdout);
601
+ }
602
+ // reset color to default if there is no pending user input
603
+ if (input_echo && (int) embd_inp.size() == n_consumed) {
604
+ console::set_display(console::reset);
605
+ }
606
+
607
+ // if not currently processing queued inputs;
608
+ if ((int) embd_inp.size() <= n_consumed) {
609
+
610
+ // deal with eot token in infill mode
611
+ if ((last_tokens.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
612
+ if(is_interacting && !params.interactive_first) {
613
+ // print an eot token
614
+ printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
615
+ }
616
+ fflush(stdout);
617
+ printf("\n");
618
+ console::set_display(console::user_input);
619
+ std::string buffer;
620
+ std::string line;
621
+ bool another_line=true;
622
+ // set a new prefix via stdin
623
+ do {
624
+ another_line = console::readline(line, params.multiline_input);
625
+ buffer += line;
626
+ } while (another_line);
627
+ // check if we got an empty line, if so we use the old input
628
+ if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
629
+ params.input_prefix = buffer;
630
+ }
631
+ buffer.clear();
632
+ // set a new suffix via stdin
633
+ do {
634
+ another_line = console::readline(line, params.multiline_input);
635
+ buffer += line;
636
+ } while (another_line);
637
+ // check if we got an empty line
638
+ if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
639
+ params.input_suffix = buffer;
640
+ }
641
+ buffer.clear();
642
+ // done taking input, reset color
643
+ console::set_display(console::reset);
644
+
645
+ if (params.escape) {
646
+ //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
647
+ process_escapes(params.input_prefix);
648
+ process_escapes(params.input_suffix);
649
+ }
650
+ suff_rm_leading_spc = params.escape;
651
+ if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
652
+ params.input_suffix.erase(0, 1);
653
+ suff_rm_leading_spc = false;
654
+ }
655
+ // tokenize new prefix and suffix
656
+ std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
657
+ std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
658
+ if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
659
+ inp_sfx.erase(inp_sfx.begin());
660
+ }
661
+ inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
662
+ if (add_bos) {
663
+ inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
664
+ }
665
+ inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
666
+ embd_inp = inp_pfx;
667
+ embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
668
+ embd_inp.push_back(llama_token_middle(ctx));
669
+ embd.clear();
670
+ embd_guidance.clear();
671
+ n_remain = params.n_predict;
672
+ n_past = 0;
673
+ n_consumed = 0;
674
+ // LOG_TEE("took new input\n");
675
+ is_interacting = false;
676
+ }
677
+ // deal with end of text token in interactive mode
678
+ else if (last_tokens.back() == llama_token_eos(ctx)) {
679
+ LOG("found EOS token\n");
680
+
681
+ if (params.interactive) {
682
+
683
+ is_interacting = true;
684
+ printf("\n");
685
+ console::set_display(console::user_input);
686
+ fflush(stdout);
687
+ }
688
+ }
689
+
690
+ if (n_past > 0 && is_interacting && !params.interactive) {
691
+ LOG("waiting for user input\n");
692
+
693
+ if (params.input_prefix_bos) {
694
+ LOG("adding input prefix BOS token\n");
695
+ embd_inp.push_back(llama_token_bos(ctx));
696
+ }
697
+
698
+ std::string buffer;
699
+ if (!params.input_prefix.empty()) {
700
+ LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
701
+ buffer += params.input_prefix;
702
+ printf("%s", buffer.c_str());
703
+ }
704
+
705
+ std::string line;
706
+ bool another_line = true;
707
+ do {
708
+ another_line = console::readline(line, params.multiline_input);
709
+ buffer += line;
710
+ } while (another_line);
711
+
712
+ // done taking input, reset color
713
+ console::set_display(console::reset);
714
+
715
+ // Add tokens to embd only if the input buffer is non-empty
716
+ // Entering an empty line lets the user pass control back
717
+ if (buffer.length() > 1) {
718
+ // append input suffix if any
719
+ if (!params.input_suffix.empty()) {
720
+ LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
721
+ buffer += params.input_suffix;
722
+ printf("%s", params.input_suffix.c_str());
723
+ }
724
+
725
+ LOG("buffer: '%s'\n", buffer.c_str());
726
+
727
+ const size_t original_size = embd_inp.size();
728
+
729
+ const auto line_inp = ::llama_tokenize(ctx, buffer, false);
730
+ LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
731
+
732
+ embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
733
+
734
+ for (size_t i = original_size; i < embd_inp.size(); ++i) {
735
+ const llama_token token = embd_inp[i];
736
+ output_tokens.push_back(token);
737
+ output_ss << llama_token_to_piece(ctx, token);
738
+ }
739
+
740
+ n_remain -= line_inp.size();
741
+ LOG("n_remain: %d\n", n_remain);
742
+ } else {
743
+ LOG("empty line, passing control back\n");
744
+ }
745
+
746
+ input_echo = false; // do not echo this again
747
+ }
748
+
749
+ if (n_past > 0) {
750
+ if (is_interacting) {
751
+ // reset grammar state if we're restarting generation
752
+ if (grammar != NULL) {
753
+ llama_grammar_free(grammar);
754
+
755
+ std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
756
+ grammar = llama_grammar_init(
757
+ grammar_rules.data(), grammar_rules.size(),
758
+ parsed_grammar.symbol_ids.at("root"));
759
+ }
760
+ }
761
+ is_interacting = false;
762
+ }
763
+ }
764
+
765
+ // end of text token
766
+ if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) {
767
+ break;
768
+ }
769
+
770
+ // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
771
+ // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
772
+ if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
773
+ n_remain = params.n_predict;
774
+ is_interacting = true;
775
+ }
776
+ }
777
+ if (!params.interactive && n_remain <= 0) {
778
+ printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
779
+ fflush(stdout);
780
+ }
781
+
782
+ llama_print_timings(ctx);
783
+ write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
784
+
785
+ if (ctx_guidance) { llama_free(ctx_guidance); }
786
+ llama_free(ctx);
787
+ llama_free_model(model);
788
+
789
+ if (grammar != NULL) {
790
+ llama_grammar_free(grammar);
791
+ }
792
+ llama_backend_free();
793
+
794
+ #ifndef LOG_DISABLE_LOGS
795
+ LOG_TEE("Log end\n");
796
+ #endif // LOG_DISABLE_LOGS
797
+
798
+ return 0;
799
+ }
800
+
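
For readers skimming the new example: infill builds a fill-in-the-middle prompt out of the model's special prefix/suffix/middle tokens and then generates the "middle" until the end-of-text (EOT) token appears. A condensed sketch of the prompt assembly performed above; it reuses only the calls that appear in this file, and the wrapper function itself is illustrative.

    #include <string>
    #include <vector>
    #include "common.h"
    #include "llama.h"

    // Layout: [BOS?] <FIM prefix> prefix tokens <FIM suffix> suffix tokens <FIM middle>
    static std::vector<llama_token> build_infill_prompt(
            llama_context * ctx, const std::string & prefix, const std::string & suffix, bool add_bos) {
        std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, prefix, false);
        std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, suffix, false);

        inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));    // FIM prefix token
        if (add_bos) {
            inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
        }
        inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));    // FIM suffix token

        std::vector<llama_token> embd_inp = inp_pfx;
        embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
        embd_inp.push_back(llama_token_middle(ctx));                 // generation continues from here
        return embd_inp;
    }
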
examples/main/main.cpp CHANGED
@@ -109,6 +109,7 @@ int main(int argc, char ** argv) {
109
  if (!gpt_params_parse(argc, argv, params)) {
110
  return 1;
111
  }
 
112
 
113
  #ifndef LOG_DISABLE_LOGS
114
  log_set_target(log_filename_generator("main", "log"));
@@ -179,7 +180,7 @@ int main(int argc, char ** argv) {
179
  // load the model and apply lora adapter, if any
180
  LOG("%s: load the model and apply lora adapter, if any\n", __func__);
181
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
182
- if (params.cfg_scale > 1.f) {
183
  struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
184
  ctx_guidance = llama_new_context_with_model(model, lparams);
185
  }
@@ -257,9 +258,9 @@ int main(int argc, char ** argv) {
257
  int guidance_offset = 0;
258
  int original_prompt_len = 0;
259
  if (ctx_guidance) {
260
- LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
261
 
262
- guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
263
  LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
264
 
265
  std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@@ -296,6 +297,9 @@ int main(int argc, char ** argv) {
296
  LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
297
  __func__, n_matching_session_tokens, embd_inp.size());
298
  }
 
 
 
299
  }
300
 
301
  LOGLN(
@@ -343,7 +347,7 @@ int main(int argc, char ** argv) {
343
 
344
  if (ctx_guidance) {
345
  LOG_TEE("\n");
346
- LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
347
  LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
348
  for (int i = 0; i < (int) guidance_inp.size(); i++) {
349
  LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@@ -395,7 +399,7 @@ int main(int argc, char ** argv) {
395
  }
396
  }
397
  LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
398
- params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
399
  LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
400
  LOG_TEE("\n\n");
401
 
@@ -413,8 +417,8 @@ int main(int argc, char ** argv) {
413
  LOG_TEE("\n");
414
 
415
  {
416
- auto it = params.logit_bias.find(llama_token_eos(ctx));
417
- if (it != params.logit_bias.end() && it->second == -INFINITY) {
418
  LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
419
  }
420
  }
@@ -469,6 +473,7 @@ int main(int argc, char ** argv) {
469
 
470
  const int n_vocab = llama_n_vocab(model);
471
 
 
472
  std::vector<llama_token_data> candidates;
473
  candidates.reserve(n_vocab);
474
 
@@ -622,7 +627,7 @@ int main(int argc, char ** argv) {
622
  LOG("saved session to %s\n", path_session.c_str());
623
  }
624
 
625
- const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
626
 
627
  last_tokens.erase(last_tokens.begin());
628
  last_tokens.push_back(id);
@@ -667,7 +672,7 @@ int main(int argc, char ** argv) {
667
  }
668
  fflush(stdout);
669
  }
670
- // reset color to default if we there is no pending user input
671
  if (input_echo && (int) embd_inp.size() == n_consumed) {
672
  console::set_display(console::reset);
673
  }
@@ -694,10 +699,8 @@ int main(int argc, char ** argv) {
694
  if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
695
  if (params.interactive) {
696
  is_interacting = true;
697
- console::set_display(console::user_input);
698
  }
699
  is_antiprompt = true;
700
- fflush(stdout);
701
  break;
702
  }
703
  }
@@ -721,8 +724,6 @@ int main(int argc, char ** argv) {
721
 
722
  is_interacting = true;
723
  printf("\n");
724
- console::set_display(console::user_input);
725
- fflush(stdout);
726
  } else if (params.instruct) {
727
  is_interacting = true;
728
  }
@@ -747,6 +748,9 @@ int main(int argc, char ** argv) {
747
  printf("%s", buffer.c_str());
748
  }
749
 
 
 
 
750
  std::string line;
751
  bool another_line = true;
752
  do {
 
109
  if (!gpt_params_parse(argc, argv, params)) {
110
  return 1;
111
  }
112
+ llama_sampling_params & sparams = params.sampling_params;
113
 
114
  #ifndef LOG_DISABLE_LOGS
115
  log_set_target(log_filename_generator("main", "log"));
 
180
  // load the model and apply lora adapter, if any
181
  LOG("%s: load the model and apply lora adapter, if any\n", __func__);
182
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
183
+ if (sparams.cfg_scale > 1.f) {
184
  struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
185
  ctx_guidance = llama_new_context_with_model(model, lparams);
186
  }
 
258
  int guidance_offset = 0;
259
  int original_prompt_len = 0;
260
  if (ctx_guidance) {
261
+ LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
262
 
263
+ guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
264
  LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
265
 
266
  std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
 
297
  LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
298
  __func__, n_matching_session_tokens, embd_inp.size());
299
  }
300
+
301
+ // remove any "future" tokens that we might have inherited from the previous session
302
+ llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
303
  }
304
 
305
  LOGLN(
 
347
 
348
  if (ctx_guidance) {
349
  LOG_TEE("\n");
350
+ LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
351
  LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
352
  for (int i = 0; i < (int) guidance_inp.size(); i++) {
353
  LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
 
399
  }
400
  }
401
  LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
402
+ sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
403
  LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
404
  LOG_TEE("\n\n");
405
 
 
417
  LOG_TEE("\n");
418
 
419
  {
420
+ auto it = sparams.logit_bias.find(llama_token_eos(ctx));
421
+ if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
422
  LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
423
  }
424
  }
 
473
 
474
  const int n_vocab = llama_n_vocab(model);
475
 
476
+ llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
477
  std::vector<llama_token_data> candidates;
478
  candidates.reserve(n_vocab);
479
 
 
627
  LOG("saved session to %s\n", path_session.c_str());
628
  }
629
 
630
+ const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
631
 
632
  last_tokens.erase(last_tokens.begin());
633
  last_tokens.push_back(id);
 
672
  }
673
  fflush(stdout);
674
  }
675
+ // reset color to default if there is no pending user input
676
  if (input_echo && (int) embd_inp.size() == n_consumed) {
677
  console::set_display(console::reset);
678
  }
 
699
  if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
700
  if (params.interactive) {
701
  is_interacting = true;
 
702
  }
703
  is_antiprompt = true;
 
704
  break;
705
  }
706
  }
 
724
 
725
  is_interacting = true;
726
  printf("\n");
 
 
727
  } else if (params.instruct) {
728
  is_interacting = true;
729
  }
 
748
  printf("%s", buffer.c_str());
749
  }
750
 
751
+ // color user input only
752
+ console::set_display(console::user_input);
753
+
754
  std::string line;
755
  bool another_line = true;
756
  do {
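
The main.cpp changes above are mostly mechanical fallout from the new common/sampling layer: a llama_sampling_context is initialized once (carrying the grammar), and llama_sampling_sample() replaces the old llama_sample_token() call inside the generation loop. The sketch below shows one sampling step using only the calls and signatures visible in this diff; the wrapper function is illustrative and it assumes common.h pulls in the new common/sampling.h.

    #include <vector>
    #include "common.h"   // assumed to include the new common/sampling.h

    // Sample the next token and slide the repetition-penalty window by one.
    static llama_token sample_next(
            llama_context                 * ctx,
            llama_context                 * ctx_guidance,   // may be NULL when cfg_scale <= 1
            llama_sampling_context        & ctx_sampling,
            std::vector<llama_token>      & last_tokens,
            std::vector<llama_token_data> & candidates) {
        const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
        last_tokens.erase(last_tokens.begin());   // keep the window a fixed size
        last_tokens.push_back(id);
        return id;
    }
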
examples/parallel/parallel.cpp CHANGED
@@ -10,6 +10,7 @@
10
  #include <cstdio>
11
  #include <string>
12
  #include <vector>
 
13
 
14
  // trim whitespace from the beginning and end of a string
15
  static std::string trim(const std::string & str) {
@@ -70,6 +71,26 @@ struct client {
70
  std::vector<llama_token> tokens_prev;
71
  };
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  int main(int argc, char ** argv) {
74
  srand(1234);
75
 
@@ -104,6 +125,25 @@ int main(int argc, char ** argv) {
104
  params.logits_all = true;
105
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  fprintf(stderr, "\n\n");
108
  fflush(stderr);
109
 
@@ -129,7 +169,7 @@ int main(int argc, char ** argv) {
129
 
130
  // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
131
  // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
132
- llama_batch batch = llama_batch_init(params.n_ctx, 0);
133
 
134
  int32_t n_total_prompt = 0;
135
  int32_t n_total_gen = 0;
@@ -233,7 +273,7 @@ int main(int argc, char ** argv) {
233
  client.n_decoded = 0;
234
  client.i_batch = batch.n_tokens - 1;
235
 
236
- LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
237
 
238
  g_seq_id += 1;
239
 
@@ -301,7 +341,7 @@ int main(int argc, char ** argv) {
301
  //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
302
  // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
303
 
304
- const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
305
 
306
  if (client.n_decoded == 1) {
307
  // start measuring generation time after the first token to make sure all concurrent clients
@@ -332,12 +372,12 @@ int main(int argc, char ** argv) {
332
  }
333
 
334
  // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
335
- llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx);
336
 
337
  const auto t_main_end = ggml_time_us();
338
 
339
- LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n",
340
- client.id, client.seq_id, client.n_prompt, client.n_decoded,
341
  (t_main_end - client.t_start_prompt) / 1e6,
342
  (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
343
  n_cache_miss,
@@ -346,7 +386,7 @@ int main(int argc, char ** argv) {
346
 
347
  n_total_prompt += client.n_prompt;
348
  n_total_gen += client.n_decoded;
349
-
350
  client.seq_id = -1;
351
  }
352
 
@@ -357,13 +397,21 @@ int main(int argc, char ** argv) {
357
 
358
  const auto t_main_end = ggml_time_us();
359
 
360
- LOG_TEE("\n\n");
 
 
 
 
 
 
 
 
361
  LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
362
  LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
363
  LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
364
  LOG_TEE("Cache misses: %6d\n", n_cache_miss);
365
 
366
- LOG_TEE("\n\n");
367
 
368
  llama_print_timings(ctx);
369
 
 
10
  #include <cstdio>
11
  #include <string>
12
  #include <vector>
13
+ #include <ctime>
14
 
15
  // trim whitespace from the beginning and end of a string
16
  static std::string trim(const std::string & str) {
 
71
  std::vector<llama_token> tokens_prev;
72
  };
73
 
74
+ static void print_date_time() {
75
+ std::time_t current_time = std::time(nullptr);
76
+ std::tm* local_time = std::localtime(&current_time);
77
+ char buffer[80];
78
+ strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
79
+
80
+ printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
81
+ }
82
+
83
+ // Define a split string function to ...
84
+ static std::vector<std::string> split_string(const std::string& input, char delimiter) {
85
+ std::vector<std::string> tokens;
86
+ std::istringstream stream(input);
87
+ std::string token;
88
+ while (std::getline(stream, token, delimiter)) {
89
+ tokens.push_back(token);
90
+ }
91
+ return tokens;
92
+ }
93
+
94
  int main(int argc, char ** argv) {
95
  srand(1234);
96
 
 
125
  params.logits_all = true;
126
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
127
 
128
+ llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
129
+
130
+ // load the prompts from an external file if there are any
131
+ if (params.prompt.empty()) {
132
+ printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
133
+ } else {
134
+ // Output each line of the input params.prompts vector and copy to k_prompts
135
+ int index = 0;
136
+ printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
137
+
138
+ std::vector<std::string> prompts = split_string(params.prompt, '\n');
139
+ for (const auto& prompt : prompts) {
140
+ k_prompts.resize(index + 1);
141
+ k_prompts[index] = prompt;
142
+ index++;
143
+ printf("%3d prompt: %s\n", index, prompt.c_str());
144
+ }
145
+ }
146
+
147
  fprintf(stderr, "\n\n");
148
  fflush(stderr);
149
 
 
169
 
170
  // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
171
  // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
172
+ llama_batch batch = llama_batch_init(n_ctx, 0);
173
 
174
  int32_t n_total_prompt = 0;
175
  int32_t n_total_gen = 0;
 
273
  client.n_decoded = 0;
274
  client.i_batch = batch.n_tokens - 1;
275
 
276
+ LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
277
 
278
  g_seq_id += 1;
279
 
 
341
  //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
342
  // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
343
 
344
+ const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);
345
 
346
  if (client.n_decoded == 1) {
347
  // start measuring generation time after the first token to make sure all concurrent clients
 
372
  }
373
 
374
  // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
375
+ llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
376
 
377
  const auto t_main_end = ggml_time_us();
378
 
379
+ LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
380
+ client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
381
  (t_main_end - client.t_start_prompt) / 1e6,
382
  (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
383
  n_cache_miss,
 
386
 
387
  n_total_prompt += client.n_prompt;
388
  n_total_gen += client.n_decoded;
389
+ llama_sampling_context_reset(ctx_sampling, client.seq_id);
390
  client.seq_id = -1;
391
  }
392
 
 
397
 
398
  const auto t_main_end = ggml_time_us();
399
 
400
+ print_date_time();
401
+
402
+ LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
403
+ if (params.prompt_file.empty()) {
404
+ params.prompt_file = "used built-in defaults";
405
+ }
406
+ LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
407
+ LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
408
+
409
  LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
410
  LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
411
  LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
412
  LOG_TEE("Cache misses: %6d\n", n_cache_miss);
413
 
414
+ LOG_TEE("\n");
415
 
416
  llama_print_timings(ctx);
417
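
One behavioural addition in the parallel example worth calling out: when a prompt file is supplied, its contents are split on newlines and each line becomes one client prompt; otherwise the built-in defaults are used. A self-contained sketch of that splitting step follows; the sample text is made up.

    #include <cstdio>
    #include <sstream>
    #include <string>
    #include <vector>

    // Same newline-splitting helper as the one added above.
    static std::vector<std::string> split_string(const std::string & input, char delimiter) {
        std::vector<std::string> tokens;
        std::istringstream stream(input);
        std::string token;
        while (std::getline(stream, token, delimiter)) {
            tokens.push_back(token);
        }
        return tokens;
    }

    int main() {
        const std::string prompt_file = "What is 2+2?\nName a colour.\nWrite one line of C.";
        const std::vector<std::string> prompts = split_string(prompt_file, '\n');
        for (size_t i = 0; i < prompts.size(); ++i) {
            printf("%3zu prompt: %s\n", i + 1, prompts[i].c_str());
        }
        return 0;
    }
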
 
examples/save-load-state/save-load-state.cpp CHANGED
@@ -8,9 +8,10 @@
8
 
9
  int main(int argc, char ** argv) {
10
  gpt_params params;
 
11
  params.seed = 42;
12
  params.n_threads = 4;
13
- params.repeat_last_n = 64;
14
  params.prompt = "The quick brown fox";
15
 
16
  if (!gpt_params_parse(argc, argv, params)) {
@@ -24,7 +25,7 @@ int main(int argc, char ** argv) {
24
  }
25
 
26
  auto n_past = 0;
27
- auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
28
 
29
  // init
30
  llama_model * model;
 
8
 
9
  int main(int argc, char ** argv) {
10
  gpt_params params;
11
+ llama_sampling_params & sparams = params.sampling_params;
12
  params.seed = 42;
13
  params.n_threads = 4;
14
+ sparams.repeat_last_n = 64;
15
  params.prompt = "The quick brown fox";
16
 
17
  if (!gpt_params_parse(argc, argv, params)) {
 
25
  }
26
 
27
  auto n_past = 0;
28
+ auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
29
 
30
  // init
31
  llama_model * model;
examples/server/index.html.hpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/server/public/index.html CHANGED
@@ -136,6 +136,11 @@
136
  display: block;
137
  }
138
 
 
 
 
 
 
139
  header, footer {
140
  text-align: center;
141
  }
@@ -145,6 +150,14 @@
145
  color: #888;
146
  }
147
 
 
 
 
 
 
 
 
 
148
 
149
  @keyframes loading-bg-wipe {
150
  0% {
@@ -187,7 +200,7 @@
187
  template: "{{prompt}}\n\n{{history}}\n{{char}}:",
188
  historyTemplate: "{{name}}: {{message}}",
189
  transcript: [],
190
- type: "chat",
191
  char: "Llama",
192
  user: "User",
193
  })
@@ -365,13 +378,44 @@
365
  return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
366
  }
367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  // send message to server
369
  const chat = async (msg) => {
370
  if (controller.value) {
371
  console.log('already running...');
372
  return;
373
  }
374
- controller.value = new AbortController();
375
 
376
  transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
377
 
@@ -391,55 +435,41 @@
391
  ).join("\n"),
392
  });
393
 
394
- const currentMessages = [];
395
- const history = session.value.transcript
396
-
397
- const llamaParams = {
398
  ...params.value,
399
  stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
400
- }
401
-
402
- for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
403
- const data = chunk.data;
404
 
405
- if (data.stop) {
406
- while (
407
- currentMessages.length > 0 &&
408
- currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
409
- ) {
410
- currentMessages.pop();
411
- }
412
- transcriptUpdate([...history, ["{{char}}", currentMessages]])
413
- console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
414
- } else {
415
- currentMessages.push(data);
416
- transcriptUpdate([...history, ["{{char}}", currentMessages]])
417
- }
418
 
419
- if (data.timings) {
420
- llamaStats.value = data.timings;
421
- }
 
 
422
  }
 
423
 
424
- controller.value = null;
 
 
425
  }
426
 
427
  function MessageInput() {
428
  const message = useSignal("")
429
 
430
- const stop = (e) => {
431
- e.preventDefault();
432
- if (controller.value) {
433
- controller.value.abort();
434
- controller.value = null;
435
- }
436
- }
437
-
438
- const reset = (e) => {
439
- stop(e);
440
- transcriptUpdate([]);
441
- }
442
-
443
  const submit = (e) => {
444
  stop(e);
445
  chat(message.value);
@@ -474,6 +504,19 @@
474
  `
475
  }
476
 
 
 
 
 
 
 
 
 
 
 
 
 
 
477
  const ChatLog = (props) => {
478
  const messages = session.value.transcript;
479
  const container = useRef(null)
@@ -497,7 +540,11 @@
497
  data;
498
  message = html`<${Markdownish} text=${template(text)} />`
499
  }
500
- return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
 
 
 
 
501
  };
502
 
503
  return html`
@@ -574,18 +621,31 @@
574
  userTemplateAutosave()
575
  }, [session.value, params.value])
576
 
577
- return html`
578
- <form>
579
- <fieldset>
580
- <${UserTemplateResetButton}/>
581
- </fieldset>
 
 
 
 
 
582
 
583
- <fieldset>
584
- <div>
585
- <label for="prompt">Prompt</label>
586
- <textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
587
- </div>
588
- </fieldset>
 
 
 
 
 
 
 
 
589
 
590
  <fieldset class="two">
591
  <div>
@@ -609,15 +669,30 @@
609
  <label for="template">Chat history template</label>
610
  <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
611
  </div>
 
 
 
 
 
 
 
 
 
 
 
612
 
 
 
 
 
613
  <div>
614
- <label for="template">Grammar</label>
615
- <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
616
- <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
617
- <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
618
  </div>
619
  </fieldset>
620
 
 
 
621
  <fieldset class="two">
622
  ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
623
  ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
@@ -851,7 +926,7 @@
851
  function App(props) {
852
 
853
  return html`
854
- <div>
855
  <header>
856
  <h1>llama.cpp</h1>
857
  </header>
@@ -861,7 +936,7 @@
861
  </main>
862
 
863
  <section id="write">
864
- <${MessageInput} />
865
  </section>
866
 
867
  <footer>
 
136
  display: block;
137
  }
138
 
139
+ fieldset label.slim {
140
+ margin: 0 0.5em;
141
+ display: inline;
142
+ }
143
+
144
  header, footer {
145
  text-align: center;
146
  }
 
150
  color: #888;
151
  }
152
 
153
+ .mode-chat textarea[name=prompt] {
154
+ height: 4.5em;
155
+ }
156
+
157
+ .mode-completion textarea[name=prompt] {
158
+ height: 10em;
159
+ }
160
+
161
 
162
  @keyframes loading-bg-wipe {
163
  0% {
 
200
  template: "{{prompt}}\n\n{{history}}\n{{char}}:",
201
  historyTemplate: "{{name}}: {{message}}",
202
  transcript: [],
203
+ type: "chat", // "chat" | "completion"
204
  char: "Llama",
205
  user: "User",
206
  })
 
378
  return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
379
  }
380
 
381
+ async function runLlama(prompt, llamaParams, char) {
382
+ const currentMessages = [];
383
+ const history = session.value.transcript;
384
+ if (controller.value) {
385
+ throw new Error("already running");
386
+ }
387
+ controller.value = new AbortController();
388
+ for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
389
+ const data = chunk.data;
390
+
391
+ if (data.stop) {
392
+ while (
393
+ currentMessages.length > 0 &&
394
+ currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
395
+ ) {
396
+ currentMessages.pop();
397
+ }
398
+ transcriptUpdate([...history, [char, currentMessages]])
399
+ console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
400
+ } else {
401
+ currentMessages.push(data);
402
+ transcriptUpdate([...history, [char, currentMessages]])
403
+ }
404
+
405
+ if (data.timings) {
406
+ llamaStats.value = data.timings;
407
+ }
408
+ }
409
+
410
+ controller.value = null;
411
+ }
412
+
413
  // send message to server
414
  const chat = async (msg) => {
415
  if (controller.value) {
416
  console.log('already running...');
417
  return;
418
  }
 
419
 
420
  transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
421
 
 
435
  ).join("\n"),
436
  });
437
 
438
+ await runLlama(prompt, {
 
 
 
439
  ...params.value,
440
  stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
441
+ }, "{{char}}");
442
+ }
 
 
443
 
444
+ const runCompletion = async () => {
445
+ if (controller.value) {
446
+ console.log('already running...');
447
+ return;
448
+ }
449
+ const {prompt} = session.value;
450
+ transcriptUpdate([...session.value.transcript, ["", prompt]]);
451
+ await runLlama(prompt, {
452
+ ...params.value,
453
+ stop: [],
454
+ }, "");
455
+ }
 
456
 
457
+ const stop = (e) => {
458
+ e.preventDefault();
459
+ if (controller.value) {
460
+ controller.value.abort();
461
+ controller.value = null;
462
  }
463
+ }
464
 
465
+ const reset = (e) => {
466
+ stop(e);
467
+ transcriptUpdate([]);
468
  }
469
 
470
  function MessageInput() {
471
  const message = useSignal("")
472
 
 
 
473
  const submit = (e) => {
474
  stop(e);
475
  chat(message.value);
 
504
  `
505
  }
506
 
507
+ function CompletionControls() {
508
+ const submit = (e) => {
509
+ stop(e);
510
+ runCompletion();
511
+ }
512
+ return html`
513
+ <div>
514
+ <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
515
+ <button onclick=${stop} disabled=${!generating.value}>Stop</button>
516
+ <button onclick=${reset}>Reset</button>
517
+ </div>`;
518
+ }
519
+
520
  const ChatLog = (props) => {
521
  const messages = session.value.transcript;
522
  const container = useRef(null)
 
540
  data;
541
  message = html`<${Markdownish} text=${template(text)} />`
542
  }
543
+ if(user) {
544
+ return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
545
+ } else {
546
+ return html`<p key=${index}>${message}</p>`
547
+ }
548
  };
549
 
550
  return html`
 
621
  userTemplateAutosave()
622
  }, [session.value, params.value])
623
 
624
+ const GrammarControl = () => (
625
+ html`
626
+ <div>
627
+ <label for="template">Grammar</label>
628
+ <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
629
+ <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
630
+ <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
631
+ </div>
632
+ `
633
+ );
634
 
635
+ const PromptControlFieldSet = () => (
636
+ html`
637
+ <fieldset>
638
+ <div>
639
+ <label htmlFor="prompt">Prompt</label>
640
+ <textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
641
+ </div>
642
+ </fieldset>
643
+ `
644
+ );
645
+
646
+ const ChatConfigForm = () => (
647
+ html`
648
+ ${PromptControlFieldSet()}
649
 
650
  <fieldset class="two">
651
  <div>
 
669
  <label for="template">Chat history template</label>
670
  <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
671
  </div>
672
+ ${GrammarControl()}
673
+ </fieldset>
674
+ `
675
+ );
676
+
677
+ const CompletionConfigForm = () => (
678
+ html`
679
+ ${PromptControlFieldSet()}
680
+ <fieldset>${GrammarControl()}</fieldset>
681
+ `
682
+ );
683
 
684
+ return html`
685
+ <form>
686
+ <fieldset class="two">
687
+ <${UserTemplateResetButton}/>
688
  <div>
689
+ <label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
690
+ <label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
 
 
691
  </div>
692
  </fieldset>
693
 
694
+ ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
695
+
696
  <fieldset class="two">
697
  ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
698
  ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
 
926
  function App(props) {
927
 
928
  return html`
929
+ <div class="mode-${session.value.type}">
930
  <header>
931
  <h1>llama.cpp</h1>
932
  </header>
 
936
  </main>
937
 
938
  <section id="write">
939
+ <${session.value.type === 'chat' ? MessageInput : CompletionControls} />
940
  </section>
941
 
942
  <footer>
examples/server/server.cpp CHANGED
@@ -200,6 +200,7 @@ struct llama_server_context
200
  llama_model *model = nullptr;
201
  llama_context *ctx = nullptr;
202
  gpt_params params;
 
203
  int n_ctx;
204
 
205
  grammar_parser::parse_state parsed_grammar;
@@ -254,6 +255,7 @@ struct llama_server_context
254
  if (grammar != nullptr) {
255
  llama_grammar_free(grammar);
256
  grammar = nullptr;
 
257
  }
258
  }
259
 
@@ -329,8 +331,8 @@ struct llama_server_context
329
  grammar_parser::print_grammar(stderr, parsed_grammar);
330
 
331
  {
332
- auto it = params.logit_bias.find(llama_token_eos(ctx));
333
- if (it != params.logit_bias.end() && it->second == -INFINITY) {
334
  LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
335
  }
336
  }
@@ -339,9 +341,89 @@ struct llama_server_context
339
  grammar = llama_grammar_init(
340
  grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
341
  }
 
342
  return true;
343
  }
344
 
 
 
345
  void loadPrompt()
346
  {
347
  auto prompt_tokens = tokenize(prompt, true); // always add BOS
@@ -383,9 +465,6 @@ struct llama_server_context
383
  // compare the evaluated prompt with the new prompt
384
  n_past = common_part(embd, prompt_tokens);
385
 
386
- // since #3228 we now have to manually manage the KV cache
387
- llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx);
388
-
389
  embd = prompt_tokens;
390
  if (n_past == num_prompt_tokens)
391
  {
@@ -393,6 +472,9 @@ struct llama_server_context
393
  n_past--;
394
  }
395
 
 
 
 
396
  LOG_VERBOSE("prompt ingested", {
397
  {"n_past", n_past},
398
  {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -440,9 +522,11 @@ struct llama_server_context
440
  });
441
  }
442
 
 
443
  while (n_past < embd.size())
444
  {
445
  int n_eval = (int)embd.size() - n_past;
 
446
  if (n_eval > params.n_batch)
447
  {
448
  n_eval = params.n_batch;
@@ -468,98 +552,20 @@ struct llama_server_context
468
  return result;
469
  }
470
 
471
- // out of user input, sample next token
472
- const float temp = params.temp;
473
- const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k;
474
- const float top_p = params.top_p;
475
- const float tfs_z = params.tfs_z;
476
- const float typical_p = params.typical_p;
477
- const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
478
- const float repeat_penalty = params.repeat_penalty;
479
- const float alpha_presence = params.presence_penalty;
480
- const float alpha_frequency = params.frequency_penalty;
481
- const int mirostat = params.mirostat;
482
- const float mirostat_tau = params.mirostat_tau;
483
- const float mirostat_eta = params.mirostat_eta;
484
- const bool penalize_nl = params.penalize_nl;
485
- const int32_t n_probs = params.n_probs;
486
-
487
  {
488
- auto *logits = llama_get_logits(ctx);
489
- auto n_vocab = llama_n_vocab(model);
490
-
491
- // Apply params.logit_bias map
492
- for (const auto &it : params.logit_bias)
493
- {
494
- logits[it.first] += it.second;
495
- }
496
-
497
  std::vector<llama_token_data> candidates;
498
- candidates.reserve(n_vocab);
499
- for (llama_token token_id = 0; token_id < n_vocab; token_id++)
500
- {
501
- candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
502
- }
503
 
504
- llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
505
-
506
- // Apply penalties
507
- float nl_logit = logits[llama_token_nl(ctx)];
508
- auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
509
- llama_sample_repetition_penalty(ctx, &candidates_p,
510
- last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
511
- last_n_repeat, repeat_penalty);
512
- llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
513
- last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
514
- last_n_repeat, alpha_frequency, alpha_presence);
515
- if (!penalize_nl)
516
- {
517
- logits[llama_token_nl(ctx)] = nl_logit;
518
- }
519
 
520
- if (grammar != nullptr) {
521
- llama_sample_grammar(ctx, &candidates_p, grammar);
522
- }
523
 
524
- if (temp <= 0)
 
525
  {
526
- // Greedy sampling
527
- result.tok = llama_sample_token_greedy(ctx, &candidates_p);
528
- if (n_probs > 0)
529
- {
530
- llama_sample_softmax(ctx, &candidates_p);
531
- }
532
- }
533
- else
534
- {
535
- if (mirostat == 1)
536
- {
537
- static float mirostat_mu = 2.0f * mirostat_tau;
538
- const int mirostat_m = 100;
539
- llama_sample_temp(ctx, &candidates_p, temp);
540
- result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
541
- }
542
- else if (mirostat == 2)
543
- {
544
- static float mirostat_mu = 2.0f * mirostat_tau;
545
- llama_sample_temp(ctx, &candidates_p, temp);
546
- result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
547
- }
548
- else
549
- {
550
- // Temperature sampling
551
- size_t min_keep = std::max(1, n_probs);
552
- llama_sample_top_k(ctx, &candidates_p, top_k, min_keep);
553
- llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
554
- llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
555
- llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
556
- llama_sample_temp(ctx, &candidates_p, temp);
557
- result.tok = llama_sample_token(ctx, &candidates_p);
558
- }
559
- }
560
-
561
- if (grammar != nullptr) {
562
- llama_grammar_accept_token(ctx, grammar, result.tok);
563
  }
564
 
565
  for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
@@ -569,7 +575,9 @@ struct llama_server_context
569
 
570
  last_n_tokens.erase(last_n_tokens.begin());
571
  last_n_tokens.push_back(result.tok);
572
- num_tokens_predicted++;
 
 
573
  }
574
 
575
  // add it to the context
@@ -629,7 +637,7 @@ struct llama_server_context
629
  const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
630
  generated_text += token_text;
631
 
632
- if (params.n_probs > 0)
633
  {
634
  generated_token_probs.push_back(token_with_probs);
635
  }
@@ -710,15 +718,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
710
  printf("usage: %s [options]\n", argv0);
711
  printf("\n");
712
  printf("options:\n");
713
- printf(" -h, --help show this help message and exit\n");
714
- printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
715
- printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
716
- printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
717
- printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
718
- printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
719
- printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
720
- printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
721
- printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
 
722
  if (llama_mlock_supported())
723
  {
724
  printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
@@ -863,6 +872,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
863
  }
864
  params.n_threads = std::stoi(argv[i]);
865
  }
 
866
  else if (arg == "-b" || arg == "--batch-size")
867
  {
868
  if (++i >= argc)
@@ -947,7 +965,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
947
  invalid_param = true;
948
  break;
949
  }
950
- params.lora_adapter.push_back({argv[i], 1.0f});
951
  params.use_mmap = false;
952
  }
953
  else if (arg == "--lora-scaled")
@@ -963,7 +981,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
963
  invalid_param = true;
964
  break;
965
  }
966
- params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
967
  params.use_mmap = false;
968
  }
969
  else if (arg == "--lora-base")
@@ -1017,34 +1035,35 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
1017
 
1018
  static json format_generation_settings(llama_server_context &llama)
1019
  {
1020
- const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
1021
- const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
 
1022
  eos_bias->second < 0.0f && std::isinf(eos_bias->second);
1023
 
1024
  return json{
1025
  {"n_ctx", llama.n_ctx},
1026
  {"model", llama.params.model_alias},
1027
  {"seed", llama.params.seed},
1028
- {"temp", llama.params.temp},
1029
- {"top_k", llama.params.top_k},
1030
- {"top_p", llama.params.top_p},
1031
- {"tfs_z", llama.params.tfs_z},
1032
- {"typical_p", llama.params.typical_p},
1033
- {"repeat_last_n", llama.params.repeat_last_n},
1034
- {"repeat_penalty", llama.params.repeat_penalty},
1035
- {"presence_penalty", llama.params.presence_penalty},
1036
- {"frequency_penalty", llama.params.frequency_penalty},
1037
- {"mirostat", llama.params.mirostat},
1038
- {"mirostat_tau", llama.params.mirostat_tau},
1039
- {"mirostat_eta", llama.params.mirostat_eta},
1040
- {"penalize_nl", llama.params.penalize_nl},
1041
  {"stop", llama.params.antiprompt},
1042
  {"n_predict", llama.params.n_predict},
1043
  {"n_keep", llama.params.n_keep},
1044
  {"ignore_eos", ignore_eos},
1045
  {"stream", llama.stream},
1046
- {"logit_bias", llama.params.logit_bias},
1047
- {"n_probs", llama.params.n_probs},
1048
  {"grammar", llama.params.grammar},
1049
  };
1050
  }
@@ -1060,8 +1079,6 @@ static json format_timings(llama_server_context &llama)
1060
  {
1061
  const auto timings = llama_get_timings(llama.ctx);
1062
 
1063
- assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
1064
-
1065
  return json{
1066
  {"prompt_n", timings.n_p_eval},
1067
  {"prompt_ms", timings.t_p_eval_ms},
@@ -1095,7 +1112,7 @@ static json format_final_response(llama_server_context &llama, const std::string
1095
  {"timings", format_timings(llama)},
1096
  };
1097
 
1098
- if (llama.params.n_probs > 0)
1099
  {
1100
  res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
1101
  }
@@ -1111,7 +1128,7 @@ static json format_partial_response(
1111
  {"stop", false},
1112
  };
1113
 
1114
- if (llama.params.n_probs > 0)
1115
  {
1116
  res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
1117
  }
@@ -1143,26 +1160,28 @@ static T json_value(const json &body, const std::string &key, const T &default_v
1143
  static void parse_options_completion(const json &body, llama_server_context &llama)
1144
  {
1145
  gpt_params default_params;
 
 
1146
 
1147
  llama.stream = json_value(body, "stream", false);
1148
  llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
1149
- llama.params.top_k = json_value(body, "top_k", default_params.top_k);
1150
- llama.params.top_p = json_value(body, "top_p", default_params.top_p);
1151
- llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
1152
- llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
1153
- llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
1154
- llama.params.temp = json_value(body, "temperature", default_params.temp);
1155
- llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
1156
- llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
1157
- llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
1158
- llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
1159
- llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
1160
- llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
1161
- llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
1162
  llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
1163
  llama.params.seed = json_value(body, "seed", default_params.seed);
1164
  llama.params.grammar = json_value(body, "grammar", default_params.grammar);
1165
- llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
1166
 
1167
  if (body.count("prompt") != 0)
1168
  {
@@ -1173,10 +1192,10 @@ static void parse_options_completion(const json &body, llama_server_context &lla
1173
  llama.prompt = "";
1174
  }
1175
 
1176
- llama.params.logit_bias.clear();
1177
  if (json_value(body, "ignore_eos", false))
1178
  {
1179
- llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
1180
  }
1181
 
1182
  const auto &logit_bias = body.find("logit_bias");
@@ -1192,11 +1211,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla
1192
  {
1193
  if (el[1].is_number())
1194
  {
1195
- llama.params.logit_bias[tok] = el[1].get<float>();
1196
  }
1197
  else if (el[1].is_boolean() && !el[1].get<bool>())
1198
  {
1199
- llama.params.logit_bias[tok] = -INFINITY;
1200
  }
1201
  }
1202
  }
@@ -1216,9 +1235,32 @@ static void parse_options_completion(const json &body, llama_server_context &lla
1216
  }
1217
  }
1218
 
 
 
1219
  LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
1220
  }
1221
 
 
 
1222
  static void log_server_request(const Request &req, const Response &res)
1223
  {
1224
  LOG_INFO("request", {
@@ -1403,7 +1445,7 @@ int main(int argc, char **argv)
1403
  }
1404
 
1405
  auto probs = llama.generated_token_probs;
1406
- if (llama.params.n_probs > 0 && llama.stopped_word) {
1407
  const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
1408
  probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
1409
  }
@@ -1455,7 +1497,7 @@ int main(int argc, char **argv)
1455
 
1456
  std::vector<completion_token_output> probs_output = {};
1457
 
1458
- if (llama.params.n_probs > 0) {
1459
  const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
1460
  size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
1461
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
@@ -1519,6 +1561,127 @@ int main(int argc, char **argv)
1519
  res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
1520
  } });
1521
 
 
 
1522
  svr.Get("/model.json", [&llama](const Request &, Response &res)
1523
  {
1524
  const json data = format_generation_settings(llama);
 
200
  llama_model *model = nullptr;
201
  llama_context *ctx = nullptr;
202
  gpt_params params;
203
+ llama_sampling_context ctx_sampling;
204
  int n_ctx;
205
 
206
  grammar_parser::parse_state parsed_grammar;
 
255
  if (grammar != nullptr) {
256
  llama_grammar_free(grammar);
257
  grammar = nullptr;
258
+ ctx_sampling = llama_sampling_context_init(params, NULL);
259
  }
260
  }
261
 
 
331
  grammar_parser::print_grammar(stderr, parsed_grammar);
332
 
333
  {
334
+ auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx));
335
+ if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) {
336
  LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
337
  }
338
  }
 
341
  grammar = llama_grammar_init(
342
  grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
343
  }
344
+ ctx_sampling = llama_sampling_context_init(params, grammar);
345
  return true;
346
  }
347
 
348
+ void loadInfill()
349
+ {
350
+ bool suff_rm_leading_spc = true;
351
+ if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
352
+ params.input_suffix.erase(0, 1);
353
+ suff_rm_leading_spc = false;
354
+ }
355
+
356
+ auto prefix_tokens = tokenize(params.input_prefix, false);
357
+ auto suffix_tokens = tokenize(params.input_suffix, false);
358
+ const int space_token = 29871;
359
+ if (suff_rm_leading_spc && suffix_tokens[0] == space_token) {
360
+ suffix_tokens.erase(suffix_tokens.begin());
361
+ }
362
+ prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
363
+ prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
364
+ prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
365
+ prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
366
+ prefix_tokens.push_back(llama_token_middle(ctx));
367
+ auto prompt_tokens = prefix_tokens;
368
+
369
+ num_prompt_tokens = prompt_tokens.size();
370
+
371
+ if (params.n_keep < 0)
372
+ {
373
+ params.n_keep = (int)num_prompt_tokens;
374
+ }
375
+ params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
376
+
377
+ // if input prompt is too big, truncate like normal
378
+ if (num_prompt_tokens >= (size_t)params.n_ctx)
379
+ {
380
+ printf("Input prompt is too big, truncating. Can only take %d tokens but got %zu\n", params.n_ctx, num_prompt_tokens);
381
+ // todo we probably want to cut from both sides
382
+ const int n_left = (params.n_ctx - params.n_keep) / 2;
383
+ std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
384
+ const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
385
+ new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
386
+ std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
387
+
388
+ LOG_VERBOSE("input truncated", {
389
+ {"n_ctx", params.n_ctx},
390
+ {"n_keep", params.n_keep},
391
+ {"n_left", n_left},
392
+ {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
393
+ });
394
+
395
+ truncated = true;
396
+ prompt_tokens = new_tokens;
397
+ }
398
+ else
399
+ {
400
+ const size_t ps = num_prompt_tokens;
401
+ std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
402
+ std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
403
+ }
404
+
405
+ // compare the evaluated prompt with the new prompt
406
+ n_past = common_part(embd, prompt_tokens);
407
+ embd = prompt_tokens;
408
+
409
+ if (n_past == num_prompt_tokens)
410
+ {
411
+ // we have to evaluate at least 1 token to generate logits.
412
+ printf("we have to evaluate at least 1 token to generate logits\n");
413
+ n_past--;
414
+ }
415
+
416
+ // since #3228 we now have to manually manage the KV cache
417
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
418
+
419
+ LOG_VERBOSE("prompt ingested", {
420
+ {"n_past", n_past},
421
+ {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
422
+ {"to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())},
423
+ });
424
+
425
+ has_next_token = true;
426
+ }
427
  void loadPrompt()
428
  {
429
  auto prompt_tokens = tokenize(prompt, true); // always add BOS
 
465
  // compare the evaluated prompt with the new prompt
466
  n_past = common_part(embd, prompt_tokens);
467
 
 
 
 
468
  embd = prompt_tokens;
469
  if (n_past == num_prompt_tokens)
470
  {
 
472
  n_past--;
473
  }
474
 
475
+ // since #3228 we now have to manually manage the KV cache
476
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
477
+
478
  LOG_VERBOSE("prompt ingested", {
479
  {"n_past", n_past},
480
  {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
 
522
  });
523
  }
524
 
525
+ bool tg = true;
526
  while (n_past < embd.size())
527
  {
528
  int n_eval = (int)embd.size() - n_past;
529
+ tg = n_eval == 1;
530
  if (n_eval > params.n_batch)
531
  {
532
  n_eval = params.n_batch;
 
552
  return result;
553
  }
554
 
 
 
555
  {
556
+ // out of user input, sample next token
 
 
557
  std::vector<llama_token_data> candidates;
558
+ candidates.reserve(llama_n_vocab(model));
 
 
 
 
559
 
560
+ result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates);
 
 
561
 
562
+ llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
 
563
 
564
+ const int32_t n_probs = params.sampling_params.n_probs;
565
+ if (params.sampling_params.temp <= 0 && n_probs > 0)
566
  {
567
+ // For llama_sample_token_greedy we need to sort candidates
568
+ llama_sample_softmax(ctx, &candidates_p);
 
 
569
  }
570
 
571
  for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
 
575
 
576
  last_n_tokens.erase(last_n_tokens.begin());
577
  last_n_tokens.push_back(result.tok);
578
+ if (tg) {
579
+ num_tokens_predicted++;
580
+ }
581
  }
582
 
583
  // add it to the context
 
637
  const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
638
  generated_text += token_text;
639
 
640
+ if (params.sampling_params.n_probs > 0)
641
  {
642
  generated_token_probs.push_back(token_with_probs);
643
  }
 
718
  printf("usage: %s [options]\n", argv0);
719
  printf("\n");
720
  printf("options:\n");
721
+ printf(" -h, --help show this help message and exit\n");
722
+ printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
723
+ printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
724
+ printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
725
+ printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
726
+ printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
727
+ printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
728
+ printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
729
+ printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
730
+ printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
731
  if (llama_mlock_supported())
732
  {
733
  printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
 
872
  }
873
  params.n_threads = std::stoi(argv[i]);
874
  }
875
+ else if (arg == "--threads-batch" || arg == "-tb")
876
+ {
877
+ if (++i >= argc)
878
+ {
879
+ invalid_param = true;
880
+ break;
881
+ }
882
+ params.n_threads_batch = std::stoi(argv[i]);
883
+ }
884
  else if (arg == "-b" || arg == "--batch-size")
885
  {
886
  if (++i >= argc)
 
965
  invalid_param = true;
966
  break;
967
  }
968
+ params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
969
  params.use_mmap = false;
970
  }
971
  else if (arg == "--lora-scaled")
 
981
  invalid_param = true;
982
  break;
983
  }
984
+ params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
985
  params.use_mmap = false;
986
  }
987
  else if (arg == "--lora-base")
 
1035
 
1036
  static json format_generation_settings(llama_server_context &llama)
1037
  {
1038
+ const auto & sparams = llama.params.sampling_params;
1039
+ const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
1040
+ const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
1041
  eos_bias->second < 0.0f && std::isinf(eos_bias->second);
1042
 
1043
  return json{
1044
  {"n_ctx", llama.n_ctx},
1045
  {"model", llama.params.model_alias},
1046
  {"seed", llama.params.seed},
1047
+ {"temp", sparams.temp},
1048
+ {"top_k", sparams.top_k},
1049
+ {"top_p", sparams.top_p},
1050
+ {"tfs_z", sparams.tfs_z},
1051
+ {"typical_p", sparams.typical_p},
1052
+ {"repeat_last_n", sparams.repeat_last_n},
1053
+ {"repeat_penalty", sparams.repeat_penalty},
1054
+ {"presence_penalty", sparams.presence_penalty},
1055
+ {"frequency_penalty", sparams.frequency_penalty},
1056
+ {"mirostat", sparams.mirostat},
1057
+ {"mirostat_tau", sparams.mirostat_tau},
1058
+ {"mirostat_eta", sparams.mirostat_eta},
1059
+ {"penalize_nl", sparams.penalize_nl},
1060
  {"stop", llama.params.antiprompt},
1061
  {"n_predict", llama.params.n_predict},
1062
  {"n_keep", llama.params.n_keep},
1063
  {"ignore_eos", ignore_eos},
1064
  {"stream", llama.stream},
1065
+ {"logit_bias", sparams.logit_bias},
1066
+ {"n_probs", sparams.n_probs},
1067
  {"grammar", llama.params.grammar},
1068
  };
1069
  }
 
1079
  {
1080
  const auto timings = llama_get_timings(llama.ctx);
1081
 
 
 
1082
  return json{
1083
  {"prompt_n", timings.n_p_eval},
1084
  {"prompt_ms", timings.t_p_eval_ms},
 
1112
  {"timings", format_timings(llama)},
1113
  };
1114
 
1115
+ if (llama.params.sampling_params.n_probs > 0)
1116
  {
1117
  res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
1118
  }
 
1128
  {"stop", false},
1129
  };
1130
 
1131
+ if (llama.params.sampling_params.n_probs > 0)
1132
  {
1133
  res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
1134
  }
 
1160
  static void parse_options_completion(const json &body, llama_server_context &llama)
1161
  {
1162
  gpt_params default_params;
1163
+ const auto & default_sparams = default_params.sampling_params;
1164
+ auto & sparams = llama.params.sampling_params;
1165
 
1166
  llama.stream = json_value(body, "stream", false);
1167
  llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
1168
+ sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
1169
+ sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
1170
+ sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
1171
+ sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p);
1172
+ sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n);
1173
+ sparams.temp = json_value(body, "temperature", default_sparams.temp);
1174
+ sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty);
1175
+ sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty);
1176
+ sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty);
1177
+ sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat);
1178
+ sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
1179
+ sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
1180
+ sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
1181
  llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
1182
  llama.params.seed = json_value(body, "seed", default_params.seed);
1183
  llama.params.grammar = json_value(body, "grammar", default_params.grammar);
1184
+ sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);
1185
 
1186
  if (body.count("prompt") != 0)
1187
  {
 
1192
  llama.prompt = "";
1193
  }
1194
 
1195
+ sparams.logit_bias.clear();
1196
  if (json_value(body, "ignore_eos", false))
1197
  {
1198
+ sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
1199
  }
1200
 
1201
  const auto &logit_bias = body.find("logit_bias");
 
1211
  {
1212
  if (el[1].is_number())
1213
  {
1214
+ sparams.logit_bias[tok] = el[1].get<float>();
1215
  }
1216
  else if (el[1].is_boolean() && !el[1].get<bool>())
1217
  {
1218
+ sparams.logit_bias[tok] = -INFINITY;
1219
  }
1220
  }
1221
  }
 
1235
  }
1236
  }
1237
 
1238
+ llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar);
1239
+
1240
  LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
1241
  }
1242
 
1243
+ static void parse_options_infill(const json &body, llama_server_context &llama)
1244
+ {
1245
+ if (body.count("input_prefix") != 0)
1246
+ {
1247
+ llama.params.input_prefix = body["input_prefix"];
1248
+ }
1249
+ else
1250
+ {
1251
+ llama.params.input_prefix = "";
1252
+ }
1253
+ if (body.count("input_suffix") != 0)
1254
+ {
1255
+ llama.params.input_suffix = body["input_suffix"];
1256
+ }
1257
+ else
1258
+ {
1259
+ llama.params.input_suffix = "";
1260
+ }
1261
+ parse_options_completion(body, llama);
1262
+ }
1263
+
1264
  static void log_server_request(const Request &req, const Response &res)
1265
  {
1266
  LOG_INFO("request", {
 
1445
  }
1446
 
1447
  auto probs = llama.generated_token_probs;
1448
+ if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
1449
  const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
1450
  probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
1451
  }
 
1497
 
1498
  std::vector<completion_token_output> probs_output = {};
1499
 
1500
+ if (llama.params.sampling_params.n_probs > 0) {
1501
  const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
1502
  size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
1503
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
 
1561
  res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
1562
  } });
1563
 
1564
+ svr.Post("/infill", [&llama](const Request &req, Response &res)
1565
+ {
1566
+ auto lock = llama.lock();
1567
+
1568
+ llama.rewind();
1569
+
1570
+ llama_reset_timings(llama.ctx);
1571
+
1572
+ parse_options_infill(json::parse(req.body), llama);
1573
+
1574
+ if (!llama.loadGrammar())
1575
+ {
1576
+ res.status = 400;
1577
+ return;
1578
+ }
1579
+ llama.loadInfill();
1580
+ llama.beginCompletion();
1581
+ const auto chunked_content_provider = [&](size_t, DataSink & sink) {
1582
+ size_t sent_count = 0;
1583
+ size_t sent_token_probs_index = 0;
1584
+
1585
+ while (llama.has_next_token) {
1586
+ const completion_token_output token_with_probs = llama.doCompletion();
1587
+ if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
1588
+ continue;
1589
+ }
1590
+ const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
1591
+
1592
+ size_t pos = std::min(sent_count, llama.generated_text.size());
1593
+
1594
+ const std::string str_test = llama.generated_text.substr(pos);
1595
+ bool is_stop_full = false;
1596
+ size_t stop_pos =
1597
+ llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
1598
+ if (stop_pos != std::string::npos) {
1599
+ is_stop_full = true;
1600
+ llama.generated_text.erase(
1601
+ llama.generated_text.begin() + pos + stop_pos,
1602
+ llama.generated_text.end());
1603
+ pos = std::min(sent_count, llama.generated_text.size());
1604
+ } else {
1605
+ is_stop_full = false;
1606
+ stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
1607
+ STOP_PARTIAL);
1608
+ }
1609
+
1610
+ if (
1611
+ stop_pos == std::string::npos ||
1612
+ // Send rest of the text if we are at the end of the generation
1613
+ (!llama.has_next_token && !is_stop_full && stop_pos > 0)
1614
+ ) {
1615
+ const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
1616
+
1617
+ sent_count += to_send.size();
1618
+
1619
+ std::vector<completion_token_output> probs_output = {};
1620
+
1621
+ if (llama.params.sampling_params.n_probs > 0) {
1622
+ const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
1623
+ size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
1624
+ size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
1625
+ if (probs_pos < probs_stop_pos) {
1626
+ probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
1627
+ }
1628
+ sent_token_probs_index = probs_stop_pos;
1629
+ }
1630
+
1631
+ const json data = format_partial_response(llama, to_send, probs_output);
1632
+
1633
+ const std::string str =
1634
+ "data: " +
1635
+ data.dump(-1, ' ', false, json::error_handler_t::replace) +
1636
+ "\n\n";
1637
+
1638
+ LOG_VERBOSE("data stream", {
1639
+ { "to_send", str }
1640
+ });
1641
+
1642
+ if (!sink.write(str.data(), str.size())) {
1643
+ LOG_VERBOSE("stream closed", {});
1644
+ llama_print_timings(llama.ctx);
1645
+ return false;
1646
+ }
1647
+ }
1648
+
1649
+ if (!llama.has_next_token) {
1650
+ // Generation is done, send extra information.
1651
+ const json data = format_final_response(
1652
+ llama,
1653
+ "",
1654
+ std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index)
1655
+ );
1656
+
1657
+ const std::string str =
1658
+ "data: " +
1659
+ data.dump(-1, ' ', false, json::error_handler_t::replace) +
1660
+ "\n\n";
1661
+
1662
+ LOG_VERBOSE("data stream", {
1663
+ { "to_send", str }
1664
+ });
1665
+
1666
+ if (!sink.write(str.data(), str.size())) {
1667
+ LOG_VERBOSE("stream closed", {});
1668
+ llama_print_timings(llama.ctx);
1669
+ return false;
1670
+ }
1671
+ }
1672
+ }
1673
+
1674
+ llama_print_timings(llama.ctx);
1675
+ sink.done();
1676
+ return true;
1677
+ };
1678
+ const auto on_complete = [&](bool) {
1679
+ llama.mutex.unlock();
1680
+ };
1681
+ lock.release();
1682
+ res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
1683
+ });
1684
+
1685
  svr.Get("/model.json", [&llama](const Request &, Response &res)
1686
  {
1687
  const json data = format_generation_settings(llama);
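
The new /infill route registered above behaves like /completion, but reads "input_prefix" and "input_suffix" from the JSON body and always streams SSE "data: {...}" chunks back. A rough client-side sketch follows; the port, the include paths and the use of cpp-httplib plus nlohmann::json on the client side are illustration assumptions, not part of this commit.

// Hypothetical client for the new /infill endpoint (sketch only).
// Assumes the server listens on localhost:8080 and that "httplib.h" and
// "json.hpp" (both vendored under examples/server) are on the include path.
#include "httplib.h"
#include "json.hpp"
#include <iostream>

int main() {
    const nlohmann::json body = {
        {"input_prefix", "int add(int a, int b) {\n"},  // text before the hole
        {"input_suffix", "\n}\n"},                      // text after the hole
        {"n_predict",    64}                            // regular completion options still apply
    };

    httplib::Client cli("localhost", 8080);
    auto res = cli.Post("/infill", body.dump(), "application/json");
    if (res && res->status == 200) {
        std::cout << res->body << std::endl;            // a series of "data: {...}" events
    }
    return 0;
}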
examples/speculative/speculative.cpp CHANGED
@@ -125,6 +125,8 @@ int main(int argc, char ** argv) {
125
  grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
126
  }
127
 
 
 
128
  const auto t_dec_start = ggml_time_us();
129
 
130
  while (true) {
@@ -134,7 +136,7 @@ int main(int argc, char ** argv) {
134
 
135
  while (true) {
136
  // sample from the target model
137
- llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
138
 
139
  // remember which tokens were sampled - used for repetition penalties during sampling
140
  last_tokens.erase(last_tokens.begin());
@@ -172,7 +174,7 @@ int main(int argc, char ** argv) {
172
  LOG("out of drafted tokens\n");
173
  }
174
 
175
- llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, n_ctx);
176
  llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
177
  ++n_past_dft;
178
 
@@ -211,7 +213,13 @@ int main(int argc, char ** argv) {
211
  if (grammar_dft) {
212
  llama_grammar_free(grammar_dft);
213
  }
214
- grammar_dft = llama_grammar_copy(grammar_tgt);
 
 
215
 
216
  LOG("copied target grammar to draft grammar\n");
217
  }
@@ -257,7 +265,7 @@ int main(int argc, char ** argv) {
257
  }
258
 
259
  // evaluate the drafted token on the draft model
260
- llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, n_ctx);
261
  llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
262
  ++n_past_cur;
263
 
@@ -267,7 +275,7 @@ int main(int argc, char ** argv) {
267
  }
268
 
269
  // evaluate the target model on the drafted tokens
270
- llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_ctx);
271
  llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
272
  ++n_past_tgt;
273
 
 
125
  grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
126
  }
127
 
128
+ llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt);
129
+
130
  const auto t_dec_start = ggml_time_us();
131
 
132
  while (true) {
 
136
 
137
  while (true) {
138
  // sample from the target model
139
+ llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft);
140
 
141
  // remember which tokens were sampled - used for repetition penalties during sampling
142
  last_tokens.erase(last_tokens.begin());
 
174
  LOG("out of drafted tokens\n");
175
  }
176
 
177
+ llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
178
  llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
179
  ++n_past_dft;
180
 
 
213
  if (grammar_dft) {
214
  llama_grammar_free(grammar_dft);
215
  }
216
+ // Note: Hardcoded to sequence id 0, if this ever supports parallel generation
217
+ // that will need to change.
218
+ auto it = ctx_sampling.sequence_contexts.find(0);
219
+ GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
220
+ // This is necessary because each sequence id in sequence_contexts
221
+ // uses a copy of the original grammar.
222
+ grammar_dft = llama_grammar_copy(it->second.grammar);
223
 
224
  LOG("copied target grammar to draft grammar\n");
225
  }
 
265
  }
266
 
267
  // evaluate the drafted token on the draft model
268
+ llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1);
269
  llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
270
  ++n_past_cur;
271
 
 
275
  }
276
 
277
  // evaluate the target model on the drafted tokens
278
+ llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1);
279
  llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
280
  ++n_past_tgt;
281
 
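The hunks above swap the old ad-hoc llama_sample_token helper for a llama_sampling_context that carries the grammar and per-sequence state, and change the llama_kv_cache_seq_rm calls to pass -1 ("up to the end of the sequence") instead of n_ctx. A minimal sketch of the new call pattern, using only signatures visible in this diff (they may differ in other revisions):

// Sketch only: llama_sampling_context_init / llama_sampling_sample are used
// exactly as in the hunks above (common/sampling.h at this revision).
#include "common.h"
#include "common/sampling.h"   // assumed include path for the new sampling helpers
#include "llama.h"
#include <vector>

static void generate_n_tokens(llama_context * ctx, llama_model * model,
                              gpt_params & params, llama_grammar * grammar,
                              int n_past, int n_tokens) {
    std::vector<llama_token>      last_tokens(llama_n_ctx(ctx), 0);
    std::vector<llama_token_data> candidates;
    candidates.reserve(llama_n_vocab(model));

    // one shared sampling state instead of passing every knob on every call
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);

    for (int i = 0; i < n_tokens; ++i) {
        llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, last_tokens, candidates);

        last_tokens.erase(last_tokens.begin());
        last_tokens.push_back(id);

        // -1 = remove everything in sequence 0 from position n_past to the end
        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
        llama_decode(ctx, llama_batch_get_one(&id, 1, n_past, 0));
        ++n_past;
    }
}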
ggml-alloc.c CHANGED
@@ -1,4 +1,5 @@
1
  #include "ggml-alloc.h"
 
2
  #include "ggml.h"
3
  #include <assert.h>
4
  #include <stdarg.h>
@@ -6,25 +7,6 @@
6
  #include <stdlib.h>
7
  #include <string.h>
8
 
9
- #ifdef __has_include
10
- #if __has_include(<unistd.h>)
11
- #include <unistd.h>
12
- #if defined(_POSIX_MAPPED_FILES)
13
- #include <sys/types.h>
14
- #include <sys/mman.h>
15
- #endif
16
- #endif
17
- #endif
18
-
19
- #if defined(_WIN32)
20
- #define WIN32_LEAN_AND_MEAN
21
- #ifndef NOMINMAX
22
- #define NOMINMAX
23
- #endif
24
- #include <windows.h>
25
- #include <memoryapi.h>
26
- #endif
27
-
28
 
29
  #define UNUSED(x) (void)(x)
30
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -80,8 +62,9 @@ struct free_block {
80
  #define MAX_FREE_BLOCKS 256
81
 
82
  struct ggml_allocr {
 
 
83
  void * data;
84
- size_t size;
85
  size_t alignment;
86
  int n_free_blocks;
87
  struct free_block free_blocks[MAX_FREE_BLOCKS];
@@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
119
  }
120
  #endif
121
 
122
- static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
123
- return ggml_nbytes(tensor);
124
-
125
- UNUSED(alloc);
126
- }
127
-
128
  // check if a tensor is allocated by this buffer
129
  static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
130
- void * ptr = tensor->data;
131
- return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
132
  }
133
 
134
  static bool ggml_is_view(struct ggml_tensor * t) {
@@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) {
136
  }
137
 
138
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
139
- #ifdef GGML_ALLOCATOR_DEBUG
140
  GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
141
  GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
142
- #endif
143
- size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
144
  size = aligned_offset(NULL, size, alloc->alignment);
145
 
146
  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
188
 
189
  tensor->data = addr;
190
  AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
 
 
191
 
192
  #ifdef GGML_ALLOCATOR_DEBUG
193
  add_allocated_tensor(alloc, tensor);
@@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
208
 
209
  // this is a very naive implementation, but for our case the number of free blocks should be very small
210
  static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
211
- void * ptr = tensor->data;
212
-
213
  if (ggml_allocr_is_own(alloc, tensor) == false) {
214
  // the tensor was not allocated in this buffer
215
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
216
  // the easiest way to deal with this is just to ignore it
 
217
  return;
218
  }
219
 
220
- size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
 
 
221
  size = aligned_offset(NULL, size, alloc->alignment);
222
  AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
223
- AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
 
224
 
225
  #ifdef GGML_ALLOCATOR_DEBUG
226
  remove_allocated_tensor(alloc, tensor);
@@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
285
  alloc->n_free_blocks = 1;
286
  size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
287
  alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
288
- alloc->free_blocks[0].size = alloc->size - align_offset;
289
  }
290
 
291
  struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
292
- struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
 
293
 
294
  *alloc = (struct ggml_allocr){
295
- /*.data = */ data,
296
- /*.size = */ size,
 
297
  /*.alignment = */ alignment,
298
  /*.n_free_blocks = */ 0,
299
  /*.free_blocks = */ {{0}},
@@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
312
  return alloc;
313
  }
314
 
315
- // OS specific functions to allocate and free uncommitted virtual memory
316
- static void * alloc_vmem(size_t size) {
317
- #if defined(_WIN32)
318
- return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
319
- #elif defined(_POSIX_MAPPED_FILES)
320
- void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
321
- if (ptr == MAP_FAILED) {
322
- return NULL;
323
- }
324
- return ptr;
325
- #else
326
- // use a fixed address for other platforms
327
- uintptr_t base_addr = (uintptr_t)-size - 0x100;
328
- return (void *)base_addr;
329
- #endif
330
- }
331
-
332
- static void free_vmem(void * base_addr, size_t size) {
333
- #if defined(_WIN32)
334
- VirtualFree(base_addr, 0, MEM_RELEASE);
335
- UNUSED(size);
336
- #elif defined(_POSIX_MAPPED_FILES)
337
- munmap(base_addr, size);
338
- #else
339
- // nothing to do
340
- UNUSED(base_addr);
341
- UNUSED(size);
342
- #endif
343
- }
344
-
345
- // allocate uncommitted virtual memory to measure the size of the graph
346
- static void alloc_measure_vmem(void ** base_addr, size_t * size) {
347
- // 128GB for 64-bit, 1GB for 32-bit
348
- *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
349
- do {
350
- *base_addr = alloc_vmem(*size);
351
- if (*base_addr != NULL) {
352
- AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
353
- return;
354
- }
355
- // try again with half the size
356
- *size /= 2;
357
- } while (*size > 0);
358
-
359
- GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
360
- }
361
-
362
- static void free_measure_vmem(void * base_addr, size_t size) {
363
- free_vmem(base_addr, size);
364
- }
365
-
366
  struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
367
- struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
368
 
369
- void * base_addr;
370
- size_t size;
371
 
372
- alloc_measure_vmem(&base_addr, &size);
 
373
 
374
  *alloc = (struct ggml_allocr){
375
- /*.data = */ base_addr,
376
- /*.size = */ size,
377
- /*.alignment = */ alignment,
 
378
  /*.n_free_blocks = */ 0,
379
  /*.free_blocks = */ {{0}},
380
  /*.hash_table = */ {{0}},
381
  /*.max_size = */ 0,
382
- /*.measure = */ true,
383
  /*.parse_seq = */ {0},
384
  /*.parse_seq_len = */ 0,
385
  #ifdef GGML_ALLOCATOR_DEBUG
@@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
393
  }
394
 
395
  void ggml_allocr_free(struct ggml_allocr * alloc) {
396
- if (alloc->measure) {
397
- free_measure_vmem(alloc->data, alloc->size);
398
  }
399
  free(alloc);
400
  }
@@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
437
  case GGML_OP_ROPE:
438
  case GGML_OP_RMS_NORM:
439
  case GGML_OP_SOFT_MAX:
440
- case GGML_OP_CONT:
441
  return true;
442
 
443
  default:
@@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
445
  }
446
  }
447
 
 
 
448
  static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
449
  struct hash_node * ht = alloc->hash_table;
450
  if (node->data == NULL) {
451
  if (ggml_is_view(node)) {
452
- assert(node->view_src->data != NULL);
453
- node->data = (char *)node->view_src->data + node->view_offs;
454
  } else {
455
  // see if we can reuse a parent's buffer (inplace)
456
  if (ggml_op_can_inplace(node->op)) {
@@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
478
  // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
479
  // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
480
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
481
- node->data = parent->data;
 
 
482
  return;
483
  }
484
  }
485
  else {
486
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
487
- node->data = parent->data;
 
 
488
  return;
489
  }
490
  }
@@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
495
  }
496
  }
497
 
498
- static size_t ggml_allocr_alloc_graph_tensors_n(
499
  struct ggml_allocr * alloc,
500
  struct ggml_cgraph ** graphs, int n_graphs,
501
  struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
513
  if (ggml_is_view(node)) {
514
  struct ggml_tensor * view_src = node->view_src;
515
  hash_get(ht, view_src)->n_views += 1;
 
 
 
 
516
  }
517
 
518
  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
521
  break;
522
  }
523
  hash_get(ht, parent)->n_children += 1;
 
 
 
524
  }
525
  }
526
  }
@@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
631
  }
632
 
633
  size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
634
- return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
635
  }
636
 
637
  size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
 
1
  #include "ggml-alloc.h"
2
+ #include "ggml-backend.h"
3
  #include "ggml.h"
4
  #include <assert.h>
5
  #include <stdarg.h>
 
7
  #include <stdlib.h>
8
  #include <string.h>
9
 
 
10
 
11
  #define UNUSED(x) (void)(x)
12
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
62
  #define MAX_FREE_BLOCKS 256
63
 
64
  struct ggml_allocr {
65
+ struct ggml_backend_buffer * buffer;
66
+ bool buffer_owned;
67
  void * data;
 
68
  size_t alignment;
69
  int n_free_blocks;
70
  struct free_block free_blocks[MAX_FREE_BLOCKS];
 
102
  }
103
  #endif
104
 
 
 
105
  // check if a tensor is allocated by this buffer
106
  static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
107
+ return tensor->buffer == alloc->buffer;
 
108
  }
109
 
110
  static bool ggml_is_view(struct ggml_tensor * t) {
 
112
  }
113
 
114
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 
115
  GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
116
  GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
117
+
118
+ size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
119
  size = aligned_offset(NULL, size, alloc->alignment);
120
 
121
  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 
163
 
164
  tensor->data = addr;
165
  AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
166
+ tensor->buffer = alloc->buffer;
167
+ ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
168
 
169
  #ifdef GGML_ALLOCATOR_DEBUG
170
  add_allocated_tensor(alloc, tensor);
 
185
 
186
  // this is a very naive implementation, but for our case the number of free blocks should be very small
187
  static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 
 
188
  if (ggml_allocr_is_own(alloc, tensor) == false) {
189
  // the tensor was not allocated in this buffer
190
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
191
  // the easiest way to deal with this is just to ignore it
192
+ AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
193
  return;
194
  }
195
 
196
+ void * ptr = tensor->data;
197
+
198
+ size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
199
  size = aligned_offset(NULL, size, alloc->alignment);
200
  AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
201
+
202
+ ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
203
 
204
  #ifdef GGML_ALLOCATOR_DEBUG
205
  remove_allocated_tensor(alloc, tensor);
 
264
  alloc->n_free_blocks = 1;
265
  size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
266
  alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
267
+ alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
268
  }
269
 
270
  struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
271
+ struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
272
+
273
+ struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
274
 
275
  *alloc = (struct ggml_allocr){
276
+ /*.buffer = */ buffer,
277
+ /*.buffer_owned = */ true,
278
+ /*.base = */ ggml_backend_buffer_get_base(buffer),
279
  /*.alignment = */ alignment,
280
  /*.n_free_blocks = */ 0,
281
  /*.free_blocks = */ {{0}},
 
294
  return alloc;
295
  }
296
 
 
 
297
  struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
298
+ struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
299
+ alloc->measure = true;
300
 
301
+ return alloc;
302
+ }
303
 
304
+ struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
305
+ struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
306
 
307
  *alloc = (struct ggml_allocr){
308
+ /*.buffer = */ buffer,
309
+ /*.buffer_owned = */ false,
310
+ /*.base = */ ggml_backend_buffer_get_base(buffer),
311
+ /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
312
  /*.n_free_blocks = */ 0,
313
  /*.free_blocks = */ {{0}},
314
  /*.hash_table = */ {{0}},
315
  /*.max_size = */ 0,
316
+ /*.measure = */ false,
317
  /*.parse_seq = */ {0},
318
  /*.parse_seq_len = */ 0,
319
  #ifdef GGML_ALLOCATOR_DEBUG
 
327
  }
328
 
329
  void ggml_allocr_free(struct ggml_allocr * alloc) {
330
+ if (alloc->buffer_owned) {
331
+ ggml_backend_buffer_free(alloc->buffer);
332
  }
333
  free(alloc);
334
  }
 
371
  case GGML_OP_ROPE:
372
  case GGML_OP_RMS_NORM:
373
  case GGML_OP_SOFT_MAX:
 
374
  return true;
375
 
376
  default:
 
378
  }
379
  }
380
 
381
+ static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
382
+ assert(view->view_src != NULL && view->view_src->data != NULL);
383
+ view->backend = view->view_src->backend;
384
+ view->buffer = view->view_src->buffer;
385
+ view->data = (char *)view->view_src->data + view->view_offs;
386
+
387
+ // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
388
+ // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
389
+ assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
390
+ ggml_backend_buffer_init_tensor(alloc->buffer, view);
391
+ }
392
+
393
  static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
394
  struct hash_node * ht = alloc->hash_table;
395
  if (node->data == NULL) {
396
  if (ggml_is_view(node)) {
397
+ init_view(alloc, node);
 
398
  } else {
399
  // see if we can reuse a parent's buffer (inplace)
400
  if (ggml_op_can_inplace(node->op)) {
 
422
  // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
423
  // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
424
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
425
+ node->view_src = view_src;
426
+ view_src_hn->n_views += 1;
427
+ init_view(alloc, node);
428
  return;
429
  }
430
  }
431
  else {
432
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
433
+ node->view_src = parent;
434
+ p_hn->n_views += 1;
435
+ init_view(alloc, node);
436
  return;
437
  }
438
  }
 
443
  }
444
  }
445
 
446
+ size_t ggml_allocr_alloc_graph_n(
447
  struct ggml_allocr * alloc,
448
  struct ggml_cgraph ** graphs, int n_graphs,
449
  struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
 
461
  if (ggml_is_view(node)) {
462
  struct ggml_tensor * view_src = node->view_src;
463
  hash_get(ht, view_src)->n_views += 1;
464
+ if (node->buffer == NULL && node->data != NULL) {
465
+ // view of a pre-allocated tensor, didn't call init_view() yet
466
+ init_view(alloc, node);
467
+ }
468
  }
469
 
470
  for (int j = 0; j < GGML_MAX_SRC; j++) {
 
473
  break;
474
  }
475
  hash_get(ht, parent)->n_children += 1;
476
+ if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
477
+ init_view(alloc, parent);
478
+ }
479
  }
480
  }
481
  }
 
586
  }
587
 
588
  size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
589
+ return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
590
  }
591
 
592
  size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
ggml-alloc.h CHANGED
@@ -6,21 +6,27 @@
6
  extern "C" {
7
  #endif
8
 
 
9
 
10
  GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
11
  GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
12
 
13
  // tell the allocator to parse nodes following the order described in the list
14
  // you should call this if your graph is optimized to execute out-of-order
15
  GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
16
 
17
- GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
18
- GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
19
- GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
20
- GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
21
  GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
22
- GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
23
 
 
 
 
 
24
 
25
  #ifdef __cplusplus
26
  }
 
6
  extern "C" {
7
  #endif
8
 
9
+ struct ggml_backend_buffer;
10
 
11
  GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
12
  GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
13
+ GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
14
 
15
  // tell the allocator to parse nodes following the order described in the list
16
  // you should call this if your graph is optimized to execute out-of-order
17
  GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
18
 
19
+ GGML_API void ggml_allocr_free (struct ggml_allocr * alloc);
20
+ GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc);
21
+ GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc);
22
+ GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
23
  GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
24
+ GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);
25
 
26
+ GGML_API size_t ggml_allocr_alloc_graph_n(
27
+ struct ggml_allocr * alloc,
28
+ struct ggml_cgraph ** graphs, int n_graphs,
29
+ struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
30
 
31
  #ifdef __cplusplus
32
  }
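Taken together, the new ggml-alloc entry points support a measure-then-allocate workflow on top of a backend buffer. A rough sketch, not part of the patch, assuming a `backend` handle and a graph-building helper `build_graph()` already exist:

// pass 1: measure how much memory the graph will need
struct ggml_allocr * measure = ggml_allocr_new_measure(ggml_backend_get_alignment(backend));
size_t mem_size = ggml_allocr_alloc_graph(measure, build_graph());
ggml_allocr_free(measure);

// pass 2: allocate a real backend buffer of that size and place the graph inside it
ggml_backend_buffer_t buf   = ggml_backend_alloc_buffer(backend, mem_size);
struct ggml_allocr *  alloc = ggml_allocr_new_from_buffer(buf);
ggml_allocr_alloc_graph(alloc, build_graph());

ggml_allocr_free(alloc);           // does not free buf: the allocator does not own it
ggml_backend_buffer_free(buf);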
ggml-backend.c ADDED
@@ -0,0 +1,385 @@
1
+ #include "ggml-backend.h"
2
+ #include "ggml-alloc.h"
3
+
4
+ #include <assert.h>
5
+ #include <stdarg.h>
6
+ #include <stdio.h>
7
+ #include <stdlib.h>
8
+ #include <string.h>
9
+
10
+ #define UNUSED GGML_UNUSED
11
+
12
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
13
+
14
+ // backend buffer
15
+
16
+ ggml_backend_buffer_t ggml_backend_buffer_init(
17
+ struct ggml_backend * backend,
18
+ struct ggml_backend_buffer_i iface,
19
+ ggml_backend_buffer_context_t context,
20
+ size_t size) {
21
+ ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
22
+
23
+ GGML_ASSERT(iface.get_base != NULL);
24
+
25
+ (*buffer) = (struct ggml_backend_buffer) {
26
+ /* .interface = */ iface,
27
+ /* .backend = */ backend,
28
+ /* .context = */ context,
29
+ /* .size = */ size,
30
+ };
31
+
32
+ return buffer;
33
+ }
34
+
35
+ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
36
+ if (buffer->iface.free_buffer != NULL) {
37
+ buffer->iface.free_buffer(buffer);
38
+ }
39
+ free(buffer);
40
+ }
41
+
42
+ size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
43
+ return ggml_backend_get_alignment(buffer->backend);
44
+ }
45
+
46
+ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
47
+ return buffer->iface.get_base(buffer);
48
+ }
49
+
50
+ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
51
+ return buffer->size;
52
+ }
53
+
54
+ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
55
+ if (buffer->iface.get_alloc_size) {
56
+ return buffer->iface.get_alloc_size(buffer, tensor);
57
+ }
58
+ return ggml_nbytes(tensor);
59
+ }
60
+
61
+ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
62
+ if (buffer->iface.init_tensor) {
63
+ buffer->iface.init_tensor(buffer, tensor);
64
+ }
65
+ }
66
+
67
+ void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
68
+ if (buffer->iface.free_tensor) {
69
+ buffer->iface.free_tensor(buffer, tensor);
70
+ }
71
+ }
72
+
73
+ // backend
74
+
75
+ ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
76
+ return tensor->buffer->backend;
77
+ }
78
+
79
+ const char * ggml_backend_name(ggml_backend_t backend) {
80
+ return backend->iface.get_name(backend);
81
+ }
82
+
83
+ void ggml_backend_free(ggml_backend_t backend) {
84
+ backend->iface.free(backend);
85
+ }
86
+
87
+ ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
88
+ return backend->iface.alloc_buffer(backend, size);
89
+ }
90
+
91
+ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
92
+ return backend->iface.get_alignment(backend);
93
+ }
94
+
95
+ void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
96
+ ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
97
+ }
98
+
99
+ void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
100
+ ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
101
+ }
102
+
103
+ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
104
+ ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
105
+ ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
106
+ }
107
+
108
+ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
109
+ ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
110
+ ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
111
+ }
112
+
113
+ void ggml_backend_synchronize(ggml_backend_t backend) {
114
+ backend->iface.synchronize(backend);
115
+ }
116
+
117
+ ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
118
+ return backend->iface.graph_plan_create(backend, cgraph);
119
+ }
120
+
121
+ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
122
+ backend->iface.graph_plan_free(backend, plan);
123
+ }
124
+
125
+ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
126
+ backend->iface.graph_plan_compute(backend, plan);
127
+ }
128
+
129
+ void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
130
+ backend->iface.graph_compute(backend, cgraph);
131
+ }
132
+
133
+ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
134
+ return backend->iface.supports_op(backend, op);
135
+ }
136
+
137
+ // backend copy
138
+
139
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
140
+ if (a->type != b->type) {
141
+ return false;
142
+ }
143
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
144
+ if (a->ne[i] != b->ne[i]) {
145
+ return false;
146
+ }
147
+ if (a->nb[i] != b->nb[i]) {
148
+ return false;
149
+ }
150
+ }
151
+ return true;
152
+ }
153
+
154
+ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
155
+ //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
156
+ //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
157
+ GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
158
+
159
+ // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
160
+
161
+ if (src == dst) {
162
+ return;
163
+ }
164
+
165
+ // TODO: allow backends to support copy to/from same backend
166
+
167
+ if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
168
+ ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
169
+ } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
170
+ ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
171
+ } else {
172
+ // shouldn't be hit when copying from/to CPU
173
+ #ifndef NDEBUG
174
+ fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend));
175
+ #endif
176
+ size_t nbytes = ggml_nbytes(src);
177
+ void * data = malloc(nbytes);
178
+ ggml_backend_tensor_get(src, data, 0, nbytes);
179
+ ggml_backend_tensor_set(dst, data, 0, nbytes);
180
+ free(data);
181
+ }
182
+ }
183
+
184
+ // backend CPU
185
+
186
+ struct ggml_backend_cpu_context {
187
+ int n_threads;
188
+ void * work_data;
189
+ size_t work_size;
190
+ };
191
+
192
+ static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
193
+ return "CPU";
194
+
195
+ UNUSED(backend);
196
+ }
197
+
198
+ static void ggml_backend_cpu_free(ggml_backend_t backend) {
199
+ struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
200
+ free(cpu_ctx->work_data);
201
+ free(cpu_ctx);
202
+ free(backend);
203
+ }
204
+
205
+ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
206
+ return (void *)buffer->context;
207
+ }
208
+
209
+ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
210
+ free(buffer->context);
211
+ UNUSED(buffer);
212
+ }
213
+
214
+ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
215
+ /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
216
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
217
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
218
+ /* .init_tensor = */ NULL, // no initialization required
219
+ /* .free_tensor = */ NULL, // no cleanup required
220
+ };
221
+
222
+ // for buffers from ptr, free is not called
223
+ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
224
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
225
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
226
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
227
+ /* .init_tensor = */ NULL,
228
+ /* .free_tensor = */ NULL,
229
+ };
230
+
231
+ static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
232
+
233
+ static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
234
+ size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
235
+ void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
236
+
237
+ return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
238
+ }
239
+
240
+ static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
241
+ return TENSOR_ALIGNMENT;
242
+ UNUSED(backend);
243
+ }
244
+
245
+ static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
246
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
247
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
248
+
249
+ memcpy((char *)tensor->data + offset, data, size);
250
+
251
+ UNUSED(backend);
252
+ }
253
+
254
+ static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
255
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
256
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
257
+
258
+ memcpy(data, (const char *)tensor->data + offset, size);
259
+
260
+ UNUSED(backend);
261
+ }
262
+
263
+ static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
264
+ UNUSED(backend);
265
+ }
266
+
267
+ static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
268
+ ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
269
+
270
+ UNUSED(backend);
271
+ }
272
+
273
+ static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
274
+ // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
275
+ ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
276
+
277
+ UNUSED(backend);
278
+ }
279
+
280
+ struct ggml_backend_plan_cpu {
281
+ struct ggml_cplan cplan;
282
+ struct ggml_cgraph cgraph;
283
+ };
284
+
285
+ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
286
+ struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
287
+
288
+ struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
289
+
290
+ cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
291
+ cpu_plan->cgraph = *cgraph;
292
+
293
+ if (cpu_plan->cplan.work_size > 0) {
294
+ cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
295
+ }
296
+
297
+ return cpu_plan;
298
+ }
299
+
300
+ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
301
+ struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
302
+
303
+ free(cpu_plan->cplan.work_data);
304
+ free(cpu_plan);
305
+
306
+ UNUSED(backend);
307
+ }
308
+
309
+ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
310
+ struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
311
+
312
+ ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
313
+
314
+ UNUSED(backend);
315
+ }
316
+
317
+ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
318
+ struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
319
+
320
+ struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
321
+
322
+ if (cpu_ctx->work_size < cplan.work_size) {
323
+ // TODO: may be faster to free and use malloc to avoid the copy
324
+ cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
325
+ cpu_ctx->work_size = cplan.work_size;
326
+ }
327
+
328
+ cplan.work_data = cpu_ctx->work_data;
329
+
330
+ ggml_graph_compute(cgraph, &cplan);
331
+ }
332
+
333
+ static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
334
+ return true;
335
+ UNUSED(backend);
336
+ UNUSED(op);
337
+ }
338
+
339
+ static struct ggml_backend_i cpu_backend_i = {
340
+ /* .get_name = */ ggml_backend_cpu_name,
341
+ /* .free = */ ggml_backend_cpu_free,
342
+ /* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer,
343
+ /* .get_alignment = */ ggml_backend_cpu_get_alignment,
344
+ /* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async,
345
+ /* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async,
346
+ /* .synchronize = */ ggml_backend_cpu_synchronize,
347
+ /* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from,
348
+ /* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to,
349
+ /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
350
+ /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
351
+ /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
352
+ /* .graph_compute = */ ggml_backend_cpu_graph_compute,
353
+ /* .supports_op = */ ggml_backend_cpu_supports_op,
354
+ };
355
+
356
+ ggml_backend_t ggml_backend_cpu_init(void) {
357
+ struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
358
+
359
+ ctx->n_threads = GGML_DEFAULT_N_THREADS;
360
+ ctx->work_data = NULL;
361
+ ctx->work_size = 0;
362
+
363
+ ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
364
+
365
+ *cpu_backend = (struct ggml_backend) {
366
+ /* .interface = */ cpu_backend_i,
367
+ /* .context = */ ctx
368
+ };
369
+ return cpu_backend;
370
+ }
371
+
372
+ bool ggml_backend_is_cpu(ggml_backend_t backend) {
373
+ return backend->iface.get_name == ggml_backend_cpu_name;
374
+ }
375
+
376
+ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
377
+ GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
378
+
379
+ struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
380
+ ctx->n_threads = n_threads;
381
+ }
382
+
383
+ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
384
+ return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
385
+ }
ggml-backend.h ADDED
@@ -0,0 +1,143 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+
5
+ #ifdef __cplusplus
6
+ extern "C" {
7
+ #endif
8
+ struct ggml_backend;
9
+ struct ggml_backend_buffer;
10
+
11
+ // type-erased backend-specific types / wrappers
12
+ typedef void * ggml_backend_context_t;
13
+ typedef void * ggml_backend_graph_plan_t;
14
+ typedef void * ggml_backend_buffer_context_t;
15
+
16
+ // avoid accessing internals of these types
17
+ typedef struct ggml_backend * ggml_backend_t;
18
+ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
19
+
20
+ //
21
+ // backend buffer
22
+ //
23
+
24
+ struct ggml_backend_buffer_i {
25
+ void (*free_buffer) (ggml_backend_buffer_t buffer);
26
+ void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer
27
+ size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
28
+ void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
29
+ void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
30
+ };
31
+
32
+ // TODO: hide behind API
33
+ struct ggml_backend_buffer {
34
+ struct ggml_backend_buffer_i iface;
35
+
36
+ ggml_backend_t backend;
37
+ ggml_backend_buffer_context_t context;
38
+
39
+ size_t size;
40
+ };
41
+
42
+ // backend buffer functions
43
+ GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
44
+ struct ggml_backend * backend,
45
+ struct ggml_backend_buffer_i iface,
46
+ ggml_backend_buffer_context_t context,
47
+ size_t size);
48
+
49
+ GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
50
+ GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
51
+ GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
52
+ GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
53
+ GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
54
+ GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
55
+ GGML_API void ggml_backend_buffer_free_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
56
+
57
+ //
58
+ // backend
59
+ //
60
+
61
+ struct ggml_backend_i {
62
+ const char * (*get_name)(ggml_backend_t backend);
63
+
64
+ void (*free)(ggml_backend_t backend);
65
+
66
+ // buffer allocation
67
+ ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
68
+
69
+ // get buffer alignment
70
+ size_t (*get_alignment)(ggml_backend_t backend);
71
+
72
+ // tensor data access
73
+ // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
74
+ void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
75
+ void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
76
+ void (*synchronize) (ggml_backend_t backend);
77
+
78
+ // (optional) copy tensor between different backends, allow for single-copy tranfers
79
+ void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
80
+ void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
81
+
82
+ // compute graph with a plan
83
+ ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
84
+ void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
85
+ void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
86
+
87
+ // compute graph without a plan
88
+ void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
89
+
90
+ // check if the backend supports an operation
91
+ bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
92
+ };
93
+
94
+ // TODO: hide behind API
95
+ struct ggml_backend {
96
+ struct ggml_backend_i iface;
97
+
98
+ ggml_backend_context_t context;
99
+ };
100
+
101
+ // backend helper functions
102
+ GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
103
+
104
+ GGML_API const char * ggml_backend_name(ggml_backend_t backend);
105
+ GGML_API void ggml_backend_free(ggml_backend_t backend);
106
+
107
+ GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
108
+
109
+ GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
110
+
111
+ GGML_API void ggml_backend_tensor_set_async( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
112
+ GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
113
+
114
+ GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
115
+ GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
116
+
117
+ GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
118
+
119
+ GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
120
+
121
+ GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
122
+ GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
123
+ GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
124
+ GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
125
+
126
+ // tensor copy between different backends
127
+ GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
128
+
129
+ //
130
+ // CPU backend
131
+ //
132
+
133
+ GGML_API ggml_backend_t ggml_backend_cpu_init(void);
134
+
135
+ GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
136
+
137
+ GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
138
+
139
+ GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
140
+
141
+ #ifdef __cplusplus
142
+ }
143
+ #endif
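The header also exposes an optional plan-based compute path. For orientation only (not part of the patch), with `backend` and a built ggml_cgraph `gf` assumed to exist, the two paths look like this:

// plan path: for the CPU backend the work buffer is sized once at plan creation and reused
ggml_backend_graph_plan_t plan = ggml_backend_graph_plan_create(backend, gf);
ggml_backend_graph_plan_compute(backend, plan);
ggml_backend_graph_plan_compute(backend, plan);
ggml_backend_graph_plan_free(backend, plan);

// plan-less path: the backend sizes its own scratch data per call
ggml_backend_graph_compute(backend, gf);
ggml_backend_synchronize(backend);   // in case the backend queued the work asynchronously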
ggml-cuda.cu CHANGED
@@ -62,6 +62,7 @@
62
  #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
63
  #define cudaMemcpyKind hipMemcpyKind
64
  #define cudaMemset hipMemset
 
65
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
66
  #define cudaSetDevice hipSetDevice
67
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -414,11 +415,13 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
414
  #define CUDA_SILU_BLOCK_SIZE 256
415
  #define CUDA_CPY_BLOCK_SIZE 32
416
  #define CUDA_SCALE_BLOCK_SIZE 256
 
417
  #define CUDA_ROPE_BLOCK_SIZE 256
418
  #define CUDA_ALIBI_BLOCK_SIZE 32
419
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
420
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
421
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
422
 
423
  // dmmv = dequantize_mul_mat_vec
424
  #ifndef GGML_CUDA_DMMV_X
@@ -1574,6 +1577,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1574
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
1575
  }
1576
 
1577
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1578
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1579
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@@ -4555,6 +4586,24 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
4555
  dst[i] = scale * x[i];
4556
  }
4557
 
4558
  static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4559
  const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4560
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5436,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
5436
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
5437
  }
5438
 
 
 
 
 
 
5439
  template<typename T>
5440
  static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5441
  const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
@@ -5699,7 +5753,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5699
  } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
5700
  GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
5701
  kind = cudaMemcpyDeviceToDevice;
5702
- struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5703
  int id;
5704
  CUDA_CHECK(cudaGetDevice(&id));
5705
  src_ptr = (char *) extra->data_device[id];
@@ -5735,6 +5789,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5735
  }
5736
  }
5737
 
5738
  inline void ggml_cuda_op_add(
5739
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5740
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6275,12 +6430,12 @@ inline void ggml_cuda_op_alibi(
6275
  const int64_t ne02 = src0->ne[2];
6276
  const int64_t nrows = ggml_nrows(src0);
6277
 
6278
- const int n_past = ((int32_t *) dst->op_params)[0];
6279
  const int n_head = ((int32_t *) dst->op_params)[1];
6280
  float max_bias;
6281
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
6282
 
6283
- GGML_ASSERT(ne01 + n_past == ne00);
6284
  GGML_ASSERT(n_head == ne02);
6285
 
6286
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -6339,7 +6494,14 @@ inline void ggml_cuda_op_scale(
6339
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6340
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6341
 
6342
- const float scale = ((float *) src1->data)[0];
6343
 
6344
  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
6345
  CUDA_CHECK(cudaGetLastError());
@@ -6349,6 +6511,24 @@ inline void ggml_cuda_op_scale(
6349
  (void) src1_dd;
6350
  }
6351
 
6352
  static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6353
  const int64_t nrows0 = ggml_nrows(src0);
6354
 
@@ -6358,9 +6538,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
6358
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6359
  GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6360
 
6361
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6362
- struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6363
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6364
 
6365
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6366
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@@ -6501,9 +6681,9 @@ static void ggml_cuda_op_mul_mat(
6501
  const size_t q8_1_ts = sizeof(block_q8_1);
6502
  const size_t q8_1_bs = QK8_1;
6503
 
6504
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6505
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6506
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6507
 
6508
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6509
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -6581,7 +6761,7 @@ static void ggml_cuda_op_mul_mat(
6581
  if (convert_src1_to_q8_1) {
6582
  src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6583
 
6584
- if (split && src1_on_device && src1_is_contiguous) {
6585
  quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6586
  CUDA_CHECK(cudaGetLastError());
6587
  }
@@ -6663,7 +6843,7 @@ static void ggml_cuda_op_mul_mat(
6663
  GGML_ASSERT(false);
6664
  }
6665
 
6666
- if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
6667
  quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6668
  CUDA_CHECK(cudaGetLastError());
6669
  }
@@ -6754,6 +6934,14 @@ static void ggml_cuda_op_mul_mat(
6754
  }
6755
  }
6756
 
6757
  static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6758
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6759
  }
@@ -6808,13 +6996,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
6808
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6809
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6810
 
6811
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6812
  void * src0_ddq = src0_extra->data_device[g_main_device];
6813
 
6814
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6815
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
6816
 
6817
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6818
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6819
 
6820
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@@ -6839,13 +7027,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
6839
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6840
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6841
 
6842
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6843
  void * src0_ddq = src0_extra->data_device[g_main_device];
6844
 
6845
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6846
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
6847
 
6848
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6849
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6850
 
6851
  const int64_t row_stride_x = nb01 / sizeof(half);
@@ -6866,11 +7054,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
6866
  }
6867
  }
6868
 
6869
- if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
6870
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
6871
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
6872
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
6873
- }else if (src0->type == GGML_TYPE_F32) {
6874
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6875
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
6876
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
@@ -6902,6 +7090,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
6902
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
6903
  }
6904
 
 
 
 
 
6905
  static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6906
  const int64_t ne = ggml_nelements(src0);
6907
  GGML_ASSERT(ne == ggml_nelements(src1));
@@ -6931,8 +7123,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
6931
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6932
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6933
 
6934
- const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6935
- const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6936
 
6937
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
6938
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -6987,8 +7179,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6987
 
6988
  const size_t nb1 = tensor->nb[1];
6989
 
6990
- ggml_backend backend = tensor->backend;
6991
- struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
6992
  memset(extra, 0, sizeof(*extra));
6993
 
6994
  for (int64_t id = 0; id < g_device_count; ++id) {
@@ -7042,7 +7234,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
7042
  CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
7043
  }
7044
 
7045
-
7046
  CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
7047
 
7048
  extra->data_device[id] = buf;
@@ -7081,17 +7272,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
7081
  delete extra;
7082
  }
7083
 
7084
- static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
7085
  static size_t g_temp_tensor_extra_index = 0;
7086
 
7087
- static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7088
  if (g_temp_tensor_extras == nullptr) {
7089
  g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
7090
  }
7091
 
7092
  size_t alloc_index = g_temp_tensor_extra_index;
7093
  g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
7094
- struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
7095
  memset(extra, 0, sizeof(*extra));
7096
 
7097
  return extra;
@@ -7119,7 +7310,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7119
  return;
7120
  }
7121
 
7122
- struct ggml_tensor_extra_gpu * extra;
7123
 
7124
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7125
  tensor->op == GGML_OP_VIEW ||
@@ -7128,7 +7319,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7128
 
7129
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7130
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7131
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7132
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7133
  size_t offset = 0;
7134
  if (tensor->op == GGML_OP_VIEW) {
@@ -7137,7 +7328,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7137
  extra = ggml_cuda_alloc_temp_tensor_extra();
7138
  extra->data_device[g_main_device] = src0_ddc + offset;
7139
  } else if (tensor->op == GGML_OP_CPY) {
7140
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
7141
  void * src1_ddv = src1_extra->data_device[g_main_device];
7142
  extra = ggml_cuda_alloc_temp_tensor_extra();
7143
  extra->data_device[g_main_device] = src1_ddv;
@@ -7179,13 +7370,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
7179
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
7180
  }
7181
 
7182
- struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
7183
 
7184
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7185
  tensor->op == GGML_OP_VIEW;
7186
 
7187
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7188
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7189
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7190
  size_t view_offset = 0;
7191
  if (tensor->op == GGML_OP_VIEW) {
@@ -7203,7 +7394,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
7203
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7204
  GGML_ASSERT(ggml_is_contiguous(tensor));
7205
 
7206
- struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7207
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7208
  CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
7209
  }
@@ -7260,58 +7451,47 @@ void ggml_cuda_free_scratch() {
7260
  g_scratch_buffer = nullptr;
7261
  }
7262
 
7263
- bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
7264
  ggml_cuda_func_t func;
7265
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
7266
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
7267
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
7268
 
 
 
 
 
7269
  switch (tensor->op) {
 
 
 
 
 
 
7270
  case GGML_OP_DUP:
7271
- if (!any_on_device) {
7272
- return false;
7273
- }
7274
  func = ggml_cuda_dup;
7275
  break;
7276
  case GGML_OP_ADD:
7277
- if (!any_on_device) {
7278
- return false;
7279
- }
7280
  func = ggml_cuda_add;
7281
  break;
7282
  case GGML_OP_MUL:
7283
- if (!any_on_device) {
7284
- return false;
7285
- }
7286
  func = ggml_cuda_mul;
7287
  break;
7288
  case GGML_OP_UNARY:
7289
  switch (ggml_get_unary_op(tensor)) {
7290
  case GGML_UNARY_OP_GELU:
7291
- if (!any_on_device) {
7292
- return false;
7293
- }
7294
  func = ggml_cuda_gelu;
7295
  break;
7296
  case GGML_UNARY_OP_SILU:
7297
- if (!any_on_device) {
7298
- return false;
7299
- }
7300
  func = ggml_cuda_silu;
7301
  break;
7302
  default:
7303
  return false;
7304
  } break;
7305
  case GGML_OP_NORM:
7306
- if (!any_on_device) {
7307
- return false;
7308
- }
7309
  func = ggml_cuda_norm;
7310
  break;
7311
  case GGML_OP_RMS_NORM:
7312
- if (!any_on_device) {
7313
- return false;
7314
- }
7315
  func = ggml_cuda_rms_norm;
7316
  break;
7317
  case GGML_OP_MUL_MAT:
@@ -7321,54 +7501,36 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
7321
  func = ggml_cuda_mul_mat;
7322
  break;
7323
  case GGML_OP_SCALE:
7324
- if (!any_on_device) {
7325
- return false;
7326
- }
7327
  func = ggml_cuda_scale;
7328
  break;
7329
- case GGML_OP_CPY:
7330
  if (!any_on_device) {
7331
  return false;
7332
  }
 
 
 
7333
  func = ggml_cuda_cpy;
7334
  break;
7335
  case GGML_OP_CONT:
7336
- if (!any_on_device) {
7337
- return false;
7338
- }
7339
  func = ggml_cuda_dup;
7340
  break;
7341
  case GGML_OP_RESHAPE:
7342
  case GGML_OP_VIEW:
7343
  case GGML_OP_PERMUTE:
7344
  case GGML_OP_TRANSPOSE:
7345
- if (!any_on_device) {
7346
- return false;
7347
- }
7348
  func = ggml_cuda_nop;
7349
  break;
7350
  case GGML_OP_DIAG_MASK_INF:
7351
- if (!any_on_device) {
7352
- return false;
7353
- }
7354
  func = ggml_cuda_diag_mask_inf;
7355
  break;
7356
  case GGML_OP_SOFT_MAX:
7357
- if (!any_on_device) {
7358
- return false;
7359
- }
7360
  func = ggml_cuda_soft_max;
7361
  break;
7362
  case GGML_OP_ROPE:
7363
- if (!any_on_device) {
7364
- return false;
7365
- }
7366
  func = ggml_cuda_rope;
7367
  break;
7368
  case GGML_OP_ALIBI:
7369
- if (!any_on_device) {
7370
- return false;
7371
- }
7372
  func = ggml_cuda_alibi;
7373
  break;
7374
  default:
@@ -7396,3 +7558,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
7396
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
7397
  snprintf(description, description_size, "%s", prop.name);
7398
  }
62
  #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
63
  #define cudaMemcpyKind hipMemcpyKind
64
  #define cudaMemset hipMemset
65
+ #define cudaMemsetAsync hipMemsetAsync
66
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
67
  #define cudaSetDevice hipSetDevice
68
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 
415
  #define CUDA_SILU_BLOCK_SIZE 256
416
  #define CUDA_CPY_BLOCK_SIZE 32
417
  #define CUDA_SCALE_BLOCK_SIZE 256
418
+ #define CUDA_CLAMP_BLOCK_SIZE 256
419
  #define CUDA_ROPE_BLOCK_SIZE 256
420
  #define CUDA_ALIBI_BLOCK_SIZE 32
421
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
422
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
423
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
424
+ #define CUDA_GET_ROWS_BLOCK_SIZE 256
425
 
426
  // dmmv = dequantize_mul_mat_vec
427
  #ifndef GGML_CUDA_DMMV_X
 
1577
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
1578
  }
1579
 
1580
+ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1581
+ static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
1582
+ const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
1583
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
1584
+
1585
+ if (col >= ncols) {
1586
+ return;
1587
+ }
1588
+
1589
+ const int r = y[row];
1590
+
1591
+ // copy x[r*ncols + col] to dst[row*ncols + col]
1592
+ const int xi = r*ncols + col;
1593
+ const int di = row*ncols + col;
1594
+
1595
+ const int ib = xi/qk; // block index
1596
+ const int iqs = (xi%qk)/qr; // quant index
1597
+ const int iybs = di - di%qk; // y block start index
1598
+ const int y_offset = qr == 1 ? 1 : qk/2;
1599
+
1600
+ // dequantize
1601
+ dfloat2 v;
1602
+ dequantize_kernel(x, ib, iqs, v);
1603
+
1604
+ dst[iybs + iqs + 0] = v.x;
1605
+ dst[iybs + iqs + y_offset] = v.y;
1606
+ }
1607
+
1608
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1609
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1610
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
 
4586
  dst[i] = scale * x[i];
4587
  }
4588
 
4589
+ static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
4590
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
4591
+
4592
+ if (i >= k) {
4593
+ return;
4594
+ }
4595
+
4596
+ dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
4597
+ }
4598
+
4599
+ template<int qk, int qr, dequantize_kernel_t dq>
4600
+ static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
4601
+ const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
4602
+ const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
4603
+ const dim3 block_nums(block_num_x, nrows, 1);
4604
+ k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
4605
+ }
4606
+
4607
  static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4608
  const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4609
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 
5485
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
5486
  }
5487
 
5488
+ static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
5489
+ const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
5490
+ clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
5491
+ }
5492
+
5493
  template<typename T>
5494
  static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5495
  const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
 
5753
  } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
5754
  GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
5755
  kind = cudaMemcpyDeviceToDevice;
5756
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5757
  int id;
5758
  CUDA_CHECK(cudaGetDevice(&id));
5759
  src_ptr = (char *) extra->data_device[id];
 
5789
  }
5790
  }
5791
 
5792
+ static void ggml_cuda_op_repeat(
5793
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5794
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
5795
+ // guaranteed to be an integer due to the check in ggml_can_repeat
5796
+ const int64_t ne0 = dst->ne[0];
5797
+ const int64_t ne1 = dst->ne[1];
5798
+ const int64_t ne2 = dst->ne[2];
5799
+ const int64_t ne3 = dst->ne[3];
5800
+
5801
+ const int64_t ne00 = src0->ne[0];
5802
+ const int64_t ne01 = src0->ne[1];
5803
+ const int64_t ne02 = src0->ne[2];
5804
+ const int64_t ne03 = src0->ne[3];
5805
+
5806
+ const size_t nb0 = dst->nb[0];
5807
+ const size_t nb1 = dst->nb[1];
5808
+ const size_t nb2 = dst->nb[2];
5809
+ const size_t nb3 = dst->nb[3];
5810
+
5811
+ const size_t nb00 = src0->nb[0];
5812
+ const size_t nb01 = src0->nb[1];
5813
+ const size_t nb02 = src0->nb[2];
5814
+ const size_t nb03 = src0->nb[3];
5815
+
5816
+ const int nr0 = (int)(ne0/ne00);
5817
+ const int nr1 = (int)(ne1/ne01);
5818
+ const int nr2 = (int)(ne2/ne02);
5819
+ const int nr3 = (int)(ne3/ne03);
5820
+
5821
+ // TODO: support for transposed / permuted tensors
5822
+ GGML_ASSERT(nb0 == sizeof(float));
5823
+ GGML_ASSERT(nb00 == sizeof(float));
5824
+
5825
+ // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
5826
+ for (int i3 = 0; i3 < nr3; i3++) {
5827
+ for (int k3 = 0; k3 < ne03; k3++) {
5828
+ for (int i2 = 0; i2 < nr2; i2++) {
5829
+ for (int k2 = 0; k2 < ne02; k2++) {
5830
+ for (int i1 = 0; i1 < nr1; i1++) {
5831
+ for (int k1 = 0; k1 < ne01; k1++) {
5832
+ for (int i0 = 0; i0 < nr0; i0++) {
5833
+ CUDA_CHECK(cudaMemcpyAsync(
5834
+ (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
5835
+ (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
5836
+ ne00*nb0, cudaMemcpyDeviceToDevice, stream));
5837
+ }
5838
+ }
5839
+ }
5840
+ }
5841
+ }
5842
+ }
5843
+ }
5844
+
5845
+ (void) src1;
5846
+ (void) src1_d;
5847
+ }
5848
+
5849
+ static void ggml_cuda_op_get_rows(
5850
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5851
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
5852
+
5853
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
5854
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
5855
+ GGML_ASSERT(ggml_is_contiguous(src0));
5856
+ GGML_ASSERT(ggml_is_contiguous(src1));
5857
+ GGML_ASSERT(ggml_is_contiguous(dst));
5858
+
5859
+ const int ncols = src0->ne[0];
5860
+ const int nrows = ggml_nelements(src1);
5861
+
5862
+ const int32_t * src1_i32 = (const int32_t *) src1_d;
5863
+
5864
+ switch (src0->type) {
5865
+ case GGML_TYPE_F16:
5866
+ get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5867
+ break;
5868
+ case GGML_TYPE_F32:
5869
+ get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5870
+ break;
5871
+ case GGML_TYPE_Q4_0:
5872
+ get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5873
+ break;
5874
+ case GGML_TYPE_Q4_1:
5875
+ get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5876
+ break;
5877
+ case GGML_TYPE_Q5_0:
5878
+ get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5879
+ break;
5880
+ case GGML_TYPE_Q5_1:
5881
+ get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5882
+ break;
5883
+ case GGML_TYPE_Q8_0:
5884
+ get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5885
+ break;
5886
+ default:
5887
+ // TODO: k-quants
5888
+ GGML_ASSERT(false);
5889
+ break;
5890
+ }
5891
+ }
5892
+
5893
  inline void ggml_cuda_op_add(
5894
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5895
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
6430
  const int64_t ne02 = src0->ne[2];
6431
  const int64_t nrows = ggml_nrows(src0);
6432
 
6433
+ //const int n_past = ((int32_t *) dst->op_params)[0];
6434
  const int n_head = ((int32_t *) dst->op_params)[1];
6435
  float max_bias;
6436
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
6437
 
6438
+ //GGML_ASSERT(ne01 + n_past == ne00);
6439
  GGML_ASSERT(n_head == ne02);
6440
 
6441
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
 
6494
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6495
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6496
 
6497
+ float scale;
6498
+ // HACK: support for ggml backend interface
6499
+ if (src1->backend == GGML_BACKEND_CPU) {
6500
+ scale = ((float *) src1->data)[0];
6501
+ } else {
6502
+ // TODO: pass pointer to kernel instead of copying to host
6503
+ CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
6504
+ }
6505
 
6506
  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
6507
  CUDA_CHECK(cudaGetLastError());
 
6511
  (void) src1_dd;
6512
  }
6513
 
6514
+ inline void ggml_cuda_op_clamp(
6515
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6516
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6517
+
6518
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6519
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
6520
+
6521
+ const float min = ((float *) dst->op_params)[0];
6522
+ const float max = ((float *) dst->op_params)[1];
6523
+
6524
+ clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
6525
+ CUDA_CHECK(cudaGetLastError());
6526
+
6527
+ (void) src1;
6528
+ (void) dst;
6529
+ (void) src1_dd;
6530
+ }
6531
+
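// Editorial note, not part of the patch: clamp_f32_cuda uses the usual rounded-up grid size,
// num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE, so every element index
// is covered and the `i >= k` guard in clamp_f32 discards the padding threads of the last block.
// For example, with the block size of 256: k = 1000 -> 4 blocks (1024 threads, 24 masked off),
// k = 1024 -> 4 blocks exactly.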
6532
  static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6533
  const int64_t nrows0 = ggml_nrows(src0);
6534
 
 
6538
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6539
  GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6540
 
6541
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6542
+ ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6543
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6544
 
6545
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6546
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
 
6681
  const size_t q8_1_ts = sizeof(block_q8_1);
6682
  const size_t q8_1_bs = QK8_1;
6683
 
6684
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6685
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6686
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6687
 
6688
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6689
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
 
6761
  if (convert_src1_to_q8_1) {
6762
  src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6763
 
6764
+ if (src1_on_device && src1_is_contiguous) {
6765
  quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6766
  CUDA_CHECK(cudaGetLastError());
6767
  }
 
6843
  GGML_ASSERT(false);
6844
  }
6845
 
6846
+ if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
6847
  quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6848
  CUDA_CHECK(cudaGetLastError());
6849
  }
 
6934
  }
6935
  }
6936
 
6937
+ static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6938
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
6939
+ }
6940
+
6941
+ static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6942
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
6943
+ }
6944
+
6945
  static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6946
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6947
  }
 
6996
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6997
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6998
 
6999
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7000
  void * src0_ddq = src0_extra->data_device[g_main_device];
7001
 
7002
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7003
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
7004
 
7005
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
7006
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
7007
 
7008
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 
7027
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7028
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7029
 
7030
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7031
  void * src0_ddq = src0_extra->data_device[g_main_device];
7032
 
7033
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7034
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
7035
 
7036
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
7037
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
7038
 
7039
  const int64_t row_stride_x = nb01 / sizeof(half);
 
7054
  }
7055
  }
7056
 
7057
+ if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
7058
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
7059
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
7060
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
7061
+ } else if (src0->type == GGML_TYPE_F32) {
7062
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
7063
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
7064
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
 
7090
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
7091
  }
7092
 
7093
+ static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7094
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
7095
+ }
7096
+
7097
  static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7098
  const int64_t ne = ggml_nelements(src0);
7099
  GGML_ASSERT(ne == ggml_nelements(src1));
 
7123
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7124
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7125
 
7126
+ const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7127
+ const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7128
 
7129
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7130
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
 
7179
 
7180
  const size_t nb1 = tensor->nb[1];
7181
 
7182
+ ggml_backend_type backend = tensor->backend;
7183
+ ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
7184
  memset(extra, 0, sizeof(*extra));
7185
 
7186
  for (int64_t id = 0; id < g_device_count; ++id) {
 
7234
  CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
7235
  }
7236
 
 
7237
  CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
7238
 
7239
  extra->data_device[id] = buf;
 
7272
  delete extra;
7273
  }
7274
 
7275
+ static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
7276
  static size_t g_temp_tensor_extra_index = 0;
7277
 
7278
+ static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7279
  if (g_temp_tensor_extras == nullptr) {
7280
  g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
7281
  }
7282
 
7283
  size_t alloc_index = g_temp_tensor_extra_index;
7284
  g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
7285
+ ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
7286
  memset(extra, 0, sizeof(*extra));
7287
 
7288
  return extra;
 
7310
  return;
7311
  }
7312
 
7313
+ ggml_tensor_extra_gpu * extra;
7314
 
7315
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7316
  tensor->op == GGML_OP_VIEW ||
 
7319
 
7320
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7321
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7322
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7323
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7324
  size_t offset = 0;
7325
  if (tensor->op == GGML_OP_VIEW) {
 
7328
  extra = ggml_cuda_alloc_temp_tensor_extra();
7329
  extra->data_device[g_main_device] = src0_ddc + offset;
7330
  } else if (tensor->op == GGML_OP_CPY) {
7331
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
7332
  void * src1_ddv = src1_extra->data_device[g_main_device];
7333
  extra = ggml_cuda_alloc_temp_tensor_extra();
7334
  extra->data_device[g_main_device] = src1_ddv;
 
7370
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
7371
  }
7372
 
7373
+ ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
7374
 
7375
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7376
  tensor->op == GGML_OP_VIEW;
7377
 
7378
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7379
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7380
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7381
  size_t view_offset = 0;
7382
  if (tensor->op == GGML_OP_VIEW) {
 
7394
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7395
  GGML_ASSERT(ggml_is_contiguous(tensor));
7396
 
7397
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7398
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7399
  CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
7400
  }
 
7451
  g_scratch_buffer = nullptr;
7452
  }
7453
 
7454
+ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
7455
  ggml_cuda_func_t func;
7456
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
7457
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
7458
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
7459
 
7460
+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
7461
+ return false;
7462
+ }
7463
+
7464
  switch (tensor->op) {
7465
+ case GGML_OP_REPEAT:
7466
+ func = ggml_cuda_repeat;
7467
+ break;
7468
+ case GGML_OP_GET_ROWS:
7469
+ func = ggml_cuda_get_rows;
7470
+ break;
7471
  case GGML_OP_DUP:
 
 
 
7472
  func = ggml_cuda_dup;
7473
  break;
7474
  case GGML_OP_ADD:
 
 
 
7475
  func = ggml_cuda_add;
7476
  break;
7477
  case GGML_OP_MUL:
 
 
 
7478
  func = ggml_cuda_mul;
7479
  break;
7480
  case GGML_OP_UNARY:
7481
  switch (ggml_get_unary_op(tensor)) {
7482
  case GGML_UNARY_OP_GELU:
 
 
 
7483
  func = ggml_cuda_gelu;
7484
  break;
7485
  case GGML_UNARY_OP_SILU:
 
 
 
7486
  func = ggml_cuda_silu;
7487
  break;
7488
  default:
7489
  return false;
7490
  } break;
7491
  case GGML_OP_NORM:
 
 
 
7492
  func = ggml_cuda_norm;
7493
  break;
7494
  case GGML_OP_RMS_NORM:
 
 
 
7495
  func = ggml_cuda_rms_norm;
7496
  break;
7497
  case GGML_OP_MUL_MAT:
 
7501
  func = ggml_cuda_mul_mat;
7502
  break;
7503
  case GGML_OP_SCALE:
 
 
 
7504
  func = ggml_cuda_scale;
7505
  break;
7506
+ case GGML_OP_CLAMP:
7507
  if (!any_on_device) {
7508
  return false;
7509
  }
7510
+ func = ggml_cuda_clamp;
7511
+ break;
7512
+ case GGML_OP_CPY:
7513
  func = ggml_cuda_cpy;
7514
  break;
7515
  case GGML_OP_CONT:
 
 
 
7516
  func = ggml_cuda_dup;
7517
  break;
7518
  case GGML_OP_RESHAPE:
7519
  case GGML_OP_VIEW:
7520
  case GGML_OP_PERMUTE:
7521
  case GGML_OP_TRANSPOSE:
 
 
 
7522
  func = ggml_cuda_nop;
7523
  break;
7524
  case GGML_OP_DIAG_MASK_INF:
 
 
 
7525
  func = ggml_cuda_diag_mask_inf;
7526
  break;
7527
  case GGML_OP_SOFT_MAX:
 
 
 
7528
  func = ggml_cuda_soft_max;
7529
  break;
7530
  case GGML_OP_ROPE:
 
 
 
7531
  func = ggml_cuda_rope;
7532
  break;
7533
  case GGML_OP_ALIBI:
 
 
 
7534
  func = ggml_cuda_alibi;
7535
  break;
7536
  default:
 
7558
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
7559
  snprintf(description, description_size, "%s", prop.name);
7560
  }
7561
+
7562
+ ////////////////////////////////////////////////////////////////////////////////
7563
+
7564
+ // backend interface
7565
+
7566
+ #define UNUSED GGML_UNUSED
7567
+
7568
+ struct ggml_backend_context_cuda {
7569
+ };
7570
+
7571
+ static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
7572
+ return GGML_CUDA_NAME;
7573
+
7574
+ UNUSED(backend);
7575
+ }
7576
+
7577
+ static void ggml_backend_cuda_free(ggml_backend_t backend) {
7578
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
7579
+ delete cuda_ctx;
7580
+ delete backend;
7581
+ }
7582
+
7583
+ struct ggml_backend_buffer_context_cuda {
7584
+ void * device;
7585
+
7586
+ ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
7587
+ size_t temp_tensor_extra_index = 0;
7588
+
7589
+ ~ggml_backend_buffer_context_cuda() {
7590
+ delete[] temp_tensor_extras;
7591
+ }
7592
+
7593
+ ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7594
+ if (temp_tensor_extras == nullptr) {
7595
+ temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
7596
+ }
7597
+
7598
+ size_t alloc_index = temp_tensor_extra_index;
7599
+ temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
7600
+ ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
7601
+ memset(extra, 0, sizeof(*extra));
7602
+
7603
+ return extra;
7604
+ }
7605
+ };
7606
+
7607
+ static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
7608
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
7609
+ CUDA_CHECK(cudaFree(ctx->device));
7610
+ delete ctx;
7611
+ }
7612
+
7613
+ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
7614
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
7615
+ return ctx->device;
7616
+ }
7617
+
7618
+ static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
7619
+ int64_t row_low = 0;
7620
+ int64_t row_high = ggml_nrows(tensor);
7621
+ int64_t nrows_split = row_high - row_low;
7622
+
7623
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
7624
+
7625
+ int64_t ne0 = tensor->ne[0];
7626
+
7627
+ if (ggml_is_quantized(tensor->type)) {
7628
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
7629
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
7630
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
7631
+ }
7632
+ }
7633
+
7634
+ return size;
7635
+
7636
+ UNUSED(buffer);
7637
+ }
7638
+
7639
+ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
7640
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
7641
+
7642
+ if (tensor->view_src != NULL && tensor->view_offs == 0) {
7643
+ assert(tensor->view_src->buffer->backend == buffer->backend);
7644
+ tensor->backend = tensor->view_src->backend;
7645
+ tensor->extra = tensor->view_src->extra;
7646
+ return;
7647
+ }
7648
+
7649
+ ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
7650
+
7651
+ extra->data_device[g_main_device] = tensor->data;
7652
+
7653
+ tensor->backend = GGML_BACKEND_GPU;
7654
+ tensor->extra = extra;
7655
+
7656
+ if (ggml_is_quantized(tensor->type)) {
7657
+ // initialize padding to 0 to avoid possible NaN values
7658
+ int64_t row_low = 0;
7659
+ int64_t row_high = ggml_nrows(tensor);
7660
+ int64_t nrows_split = row_high - row_low;
7661
+
7662
+ size_t original_size = ggml_nbytes_split(tensor, nrows_split);
7663
+ size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
7664
+
7665
+ if (padded_size > original_size && tensor->view_src == nullptr) {
7666
+ CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
7667
+ }
7668
+ }
7669
+
7670
+ UNUSED(buffer);
7671
+ }
7672
+
7673
+ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
7674
+ /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
7675
+ /* .get_base = */ ggml_backend_cuda_buffer_get_base,
7676
+ /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
7677
+ /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
7678
+ /* .free_tensor = */ NULL,
7679
+ };
7680
+
7681
+ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
7682
+ ggml_cuda_set_device(g_main_device);
7683
+
7684
+ ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
7685
+ CUDA_CHECK(cudaMalloc(&ctx->device, size));
7686
+ return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
7687
+ }
7688
+
7689
+ static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
7690
+ return 128;
7691
+ UNUSED(backend);
7692
+ }
7693
+
7694
+ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
7695
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
7696
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
7697
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7698
+
7699
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
7700
+
7701
+ UNUSED(backend);
7702
+ }
7703
+
7704
+ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
7705
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
7706
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
7707
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7708
+
7709
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
7710
+
7711
+ UNUSED(backend);
7712
+ }
7713
+
7714
+ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
7715
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
7716
+
7717
+ UNUSED(backend);
7718
+ }
7719
+
7720
+ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
7721
+ GGML_ASSERT(!"not implemented");
7722
+
7723
+ return nullptr;
7724
+
7725
+ UNUSED(backend);
7726
+ UNUSED(cgraph);
7727
+ }
7728
+
7729
+ static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
7730
+ GGML_ASSERT(!"not implemented");
7731
+
7732
+ UNUSED(backend);
7733
+ UNUSED(plan);
7734
+ }
7735
+
7736
+ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
7737
+ GGML_ASSERT(!"not implemented");
7738
+
7739
+ UNUSED(backend);
7740
+ UNUSED(plan);
7741
+ }
7742
+
7743
+ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
7744
+ ggml_cuda_set_device(g_main_device);
7745
+
7746
+ ggml_compute_params params = {};
7747
+ params.type = GGML_TASK_COMPUTE;
7748
+ params.ith = 0;
7749
+ for (int i = 0; i < cgraph->n_nodes; i++) {
7750
+ ggml_tensor * node = cgraph->nodes[i];
7751
+
7752
+ assert(node->backend == GGML_BACKEND_GPU);
7753
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
7754
+ if (node->src[j] != nullptr) {
7755
+ assert(node->src[j]->backend == GGML_BACKEND_GPU);
7756
+ }
7757
+ }
7758
+
7759
+ bool ok = ggml_cuda_compute_forward(&params, node);
7760
+ if (!ok) {
7761
+ fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
7762
+ }
7763
+ GGML_ASSERT(ok);
7764
+
7765
+ #if 0
7766
+ if (node->type == GGML_TYPE_F32) {
7767
+ cudaDeviceSynchronize();
7768
+ std::vector<float> tmp(ggml_nelements(node), 0.0f);
7769
+ cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
7770
+ printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
7771
+ ggml_type_name(node->src[0]->type),
7772
+ node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
7773
+ node->src[0]->name,
7774
+ node->src[1] ? node->src[1]->name : "none");
7775
+ double sum = 0.0;
7776
+ double sq_sum = 0.0;
7777
+ for (int i = 0; i < ggml_nelements(node); i++) {
7778
+ printf("%f ", tmp[i]);
7779
+ sum += tmp[i];
7780
+ sq_sum += tmp[i]*tmp[i];
7781
+ }
7782
+ printf("\n");
7783
+ printf("sum: %f, ", sum);
7784
+ printf("sq_sum: %f\n", sq_sum);
7785
+ }
7786
+ #endif
7787
+ }
7788
+
7789
+ UNUSED(backend);
7790
+ }
7791
+
7792
+ static ggml_backend_i cuda_backend_i = {
7793
+ /* .get_name = */ ggml_backend_cuda_name,
7794
+ /* .free = */ ggml_backend_cuda_free,
7795
+ /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
7796
+ /* .get_alignment = */ ggml_backend_cuda_get_alignment,
7797
+ /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
7798
+ /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
7799
+ /* .synchronize = */ ggml_backend_cuda_synchronize,
7800
+ /* .cpy_tensor_from = */ nullptr,
7801
+ /* .cpy_tensor_to = */ nullptr,
7802
+ /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
7803
+ /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
7804
+ /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
7805
+ /* .graph_compute = */ ggml_backend_cuda_graph_compute,
7806
+ /* .supports_op = */ nullptr,
7807
+ };
7808
+
7809
+ ggml_backend_t ggml_backend_cuda_init() {
7810
+ ggml_init_cublas(); // TODO: remove from ggml.c
7811
+
7812
+ ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
7813
+
7814
+ ggml_backend_t cuda_backend = new ggml_backend {
7815
+ /* .interface = */ cuda_backend_i,
7816
+ /* .context = */ ctx
7817
+ };
7818
+
7819
+ return cuda_backend;
7820
+ }
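
The block above plugs the existing CUDA code into the generic ggml-backend interface introduced by this commit: ggml_backend_cuda_init() returns a ggml_backend whose iface (cuda_backend_i) routes buffer allocation, tensor transfers and graph execution to the CUDA functions defined earlier in the file. The sketch below shows how user code might drive it; apart from ggml_backend_cuda_init(), every ggml_backend_* / ggml_allocr_* call is assumed to come from the new ggml-backend.h / ggml-alloc.h headers, which are not part of this hunk, so treat it as a hedged illustration rather than the canonical usage.

// Hedged sketch: drive the CUDA backend through the generic backend API.
// Only ggml_backend_cuda_init() is declared in this diff; the ggml_backend_* and
// ggml_allocr_* wrappers are assumed from ggml-backend.h / ggml-alloc.h.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cuda.h"

int main(void) {
    ggml_backend_t backend = ggml_backend_cuda_init();     // fills backend->iface with cuda_backend_i

    // describe the graph on the host without allocating tensor data (no_alloc = true)
    struct ggml_init_params ip = { /*.mem_size =*/ 1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
    struct ggml_tensor * c = ggml_add(ctx, a, b);
    struct ggml_cgraph  gf = ggml_build_forward(c);

    // device memory comes from the alloc_buffer entry (ggml_backend_cuda_alloc_buffer)
    ggml_backend_buffer_t buf  = ggml_backend_alloc_buffer(backend, 8*ggml_nbytes(a));
    struct ggml_allocr * alloc = ggml_allocr_new_from_buffer(buf);     // assumed helper from ggml-alloc.h
    ggml_allocr_alloc(alloc, a);
    ggml_allocr_alloc(alloc, b);
    ggml_allocr_alloc_graph(alloc, &gf);

    float ones[4096];
    for (int i = 0; i < 4096; ++i) { ones[i] = 1.0f; }
    ggml_backend_tensor_set(a, ones, 0, sizeof(ones));      // routes to set_tensor_async + synchronize
    ggml_backend_tensor_set(b, ones, 0, sizeof(ones));

    ggml_backend_graph_compute(backend, &gf);               // calls ggml_backend_cuda_graph_compute above

    float out[4096];
    ggml_backend_tensor_get(c, out, 0, sizeof(out));        // expect out[i] == 2.0f

    ggml_allocr_free(alloc);
    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}
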
ggml-cuda.h CHANGED
@@ -1,6 +1,7 @@
1
  #pragma once
2
 
3
  #include "ggml.h"
 
4
 
5
  #ifdef GGML_USE_HIPBLAS
6
  #define GGML_CUDA_NAME "ROCm"
@@ -42,6 +43,9 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s
42
  GGML_API int ggml_cuda_get_device_count(void);
43
  GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
44
 
 
 
 
45
  #ifdef __cplusplus
46
  }
47
  #endif
 
1
  #pragma once
2
 
3
  #include "ggml.h"
4
+ #include "ggml-backend.h"
5
 
6
  #ifdef GGML_USE_HIPBLAS
7
  #define GGML_CUDA_NAME "ROCm"
 
43
  GGML_API int ggml_cuda_get_device_count(void);
44
  GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
45
 
46
+ // backend API
47
+ GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
48
+
49
  #ifdef __cplusplus
50
  }
51
  #endif
ggml-metal.h CHANGED
@@ -20,6 +20,7 @@
20
  #pragma once
21
 
22
  #include "ggml.h"
 
23
 
24
  #include <stddef.h>
25
  #include <stdbool.h>
@@ -35,10 +36,15 @@ struct ggml_cgraph;
35
  extern "C" {
36
  #endif
37
 
38
- void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
 
 
 
39
 
40
  struct ggml_metal_context;
41
 
 
 
42
  // number of command buffers to use
43
  struct ggml_metal_context * ggml_metal_init(int n_cb);
44
  void ggml_metal_free(struct ggml_metal_context * ctx);
@@ -83,6 +89,17 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
83
  // creates gf->n_threads command buffers in parallel
84
  void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
85
 
 
 
 
 
 
 
 
 
 
 
 
86
  #ifdef __cplusplus
87
  }
88
  #endif
 
20
  #pragma once
21
 
22
  #include "ggml.h"
23
+ #include "ggml-backend.h"
24
 
25
  #include <stddef.h>
26
  #include <stdbool.h>
 
36
  extern "C" {
37
  #endif
38
 
39
+ //
40
+ // internal API
41
+ // temporarily exposed to user-code
42
+ //
43
 
44
  struct ggml_metal_context;
45
 
46
+ void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
47
+
48
  // number of command buffers to use
49
  struct ggml_metal_context * ggml_metal_init(int n_cb);
50
  void ggml_metal_free(struct ggml_metal_context * ctx);
 
89
  // creates gf->n_threads command buffers in parallel
90
  void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
91
 
92
+ //
93
+ // backend API
94
+ // user-code should use only these functions
95
+ //
96
+
97
+ GGML_API ggml_backend_t ggml_backend_metal_init(void);
98
+
99
+ GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
100
+
101
+ GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
102
+
103
  #ifdef __cplusplus
104
  }
105
  #endif
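
Per the comments added above, only the GGML_API declarations at the bottom of the header are meant for user code; everything else remains an internal API that is temporarily exposed. A minimal, hedged sketch of the intended flow (the n_cb value is an arbitrary example, and ggml_backend_free() is assumed from ggml-backend.h):

// Hedged sketch of the public Metal backend entry points declared above.
#include "ggml-metal.h"

ggml_backend_t init_metal(void) {
    ggml_backend_t backend = ggml_backend_metal_init();   // wraps ggml_metal_init(GGML_DEFAULT_N_THREADS)
    if (backend != NULL && ggml_backend_is_metal(backend)) {
        ggml_backend_metal_set_n_cb(backend, 4);           // tune the number of command buffers
    }
    return backend;                                        // release later with ggml_backend_free(backend)
}
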
ggml-metal.m CHANGED
@@ -779,8 +779,8 @@ void ggml_metal_graph_compute(
779
  } break;
780
  case GGML_OP_CONCAT:
781
  {
 
782
 
783
- int64_t nb = ne00;
784
  [encoder setComputePipelineState:ctx->pipeline_concat];
785
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
786
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
@@ -812,6 +812,7 @@ void ggml_metal_graph_compute(
812
  [encoder setBytes:&nb length:sizeof(nb) atIndex:27];
813
 
814
  const int nth = MIN(1024, ne0);
 
815
  [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
816
  } break;
817
  case GGML_OP_ADD:
@@ -909,9 +910,10 @@ void ggml_metal_graph_compute(
909
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
910
  [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
911
 
912
- const int64_t n = ggml_nelements(dst)/4;
 
913
 
914
- [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
915
  } break;
916
  case GGML_OP_UNARY:
917
  switch (ggml_get_unary_op(gf->nodes[i])) {
@@ -921,9 +923,10 @@ void ggml_metal_graph_compute(
921
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
922
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
923
 
924
- const int64_t n = ggml_nelements(dst)/4;
 
925
 
926
- [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
927
  } break;
928
  case GGML_UNARY_OP_RELU:
929
  {
@@ -941,9 +944,10 @@ void ggml_metal_graph_compute(
941
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
942
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
943
 
944
- const int64_t n = ggml_nelements(dst)/4;
 
945
 
946
- [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
947
  } break;
948
  default:
949
  {
@@ -1040,7 +1044,7 @@ void ggml_metal_graph_compute(
1040
  !ggml_is_transposed(src0) &&
1041
  !ggml_is_transposed(src1) &&
1042
  src1t == GGML_TYPE_F32 &&
1043
- ne00 % 32 == 0 &&
1044
  ne11 > ne11_mm_min) {
1045
  //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
1046
  switch (src0->type) {
@@ -1251,6 +1255,8 @@ void ggml_metal_graph_compute(
1251
  } break;
1252
  case GGML_OP_RMS_NORM:
1253
  {
 
 
1254
  float eps;
1255
  memcpy(&eps, dst->op_params, sizeof(float));
1256
 
@@ -1293,7 +1299,7 @@ void ggml_metal_graph_compute(
1293
 
1294
  const int nth = MIN(1024, ne00);
1295
 
1296
- const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
1297
  const int n_head = ((int32_t *) dst->op_params)[1];
1298
  float max_bias;
1299
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
@@ -1471,3 +1477,140 @@ preferably one under the recommended max working set size, or else fall back to
1471
 
1472
  }
1473
  }
 
779
  } break;
780
  case GGML_OP_CONCAT:
781
  {
782
+ const int64_t nb = ne00;
783
 
 
784
  [encoder setComputePipelineState:ctx->pipeline_concat];
785
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
786
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
 
812
  [encoder setBytes:&nb length:sizeof(nb) atIndex:27];
813
 
814
  const int nth = MIN(1024, ne0);
815
+
816
  [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
817
  } break;
818
  case GGML_OP_ADD:
 
910
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
911
  [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
912
 
913
+ const int64_t n = ggml_nelements(dst);
914
+ GGML_ASSERT(n % 4 == 0);
915
 
916
+ [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
917
  } break;
918
  case GGML_OP_UNARY:
919
  switch (ggml_get_unary_op(gf->nodes[i])) {
 
923
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
924
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
925
 
926
+ const int64_t n = ggml_nelements(dst);
927
+ GGML_ASSERT(n % 4 == 0);
928
 
929
+ [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
930
  } break;
931
  case GGML_UNARY_OP_RELU:
932
  {
 
944
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
945
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
946
 
947
+ const int64_t n = ggml_nelements(dst);
948
+ GGML_ASSERT(n % 4 == 0);
949
 
950
+ [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
951
  } break;
952
  default:
953
  {
 
1044
  !ggml_is_transposed(src0) &&
1045
  !ggml_is_transposed(src1) &&
1046
  src1t == GGML_TYPE_F32 &&
1047
+ ne00 % 32 == 0 && ne00 >= 64 &&
1048
  ne11 > ne11_mm_min) {
1049
  //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
1050
  switch (src0->type) {
 
1255
  } break;
1256
  case GGML_OP_RMS_NORM:
1257
  {
1258
+ GGML_ASSERT(ne00 % 4 == 0);
1259
+
1260
  float eps;
1261
  memcpy(&eps, dst->op_params, sizeof(float));
1262
 
 
1299
 
1300
  const int nth = MIN(1024, ne00);
1301
 
1302
+ //const int n_past = ((int32_t *) dst->op_params)[0];
1303
  const int n_head = ((int32_t *) dst->op_params)[1];
1304
  float max_bias;
1305
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
1477
 
1478
  }
1479
  }
1480
+
1481
+ ////////////////////////////////////////////////////////////////////////////////
1482
+
1483
+ // backend interface
1484
+
1485
+ static const char * ggml_backend_metal_name(ggml_backend_t backend) {
1486
+ return "Metal";
1487
+
1488
+ UNUSED(backend);
1489
+ }
1490
+
1491
+ static void ggml_backend_metal_free(ggml_backend_t backend) {
1492
+ struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
1493
+ ggml_metal_free(ctx);
1494
+ free(backend);
1495
+ }
1496
+
1497
+ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
1498
+ return (void *)buffer->context;
1499
+ }
1500
+
1501
+ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1502
+ free(buffer->context);
1503
+ UNUSED(buffer);
1504
+ }
1505
+
1506
+ static struct ggml_backend_buffer_i metal_backend_buffer_i = {
1507
+ /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
1508
+ /* .get_base = */ ggml_backend_metal_buffer_get_base,
1509
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
1510
+ /* .init_tensor = */ NULL, // no initialization required
1511
+ /* .free_tensor = */ NULL, // no cleanup required
1512
+ };
1513
+
1514
+ static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
1515
+ struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
1516
+
1517
+ void * data = ggml_metal_host_malloc(size);
1518
+
1519
+ // TODO: set proper name of the buffers
1520
+ ggml_metal_add_buffer(ctx, "backend", data, size, 0);
1521
+
1522
+ return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
1523
+ }
1524
+
1525
+ static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
1526
+ return 32;
1527
+ UNUSED(backend);
1528
+ }
1529
+
1530
+ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
1531
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
1532
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
1533
+
1534
+ memcpy((char *)tensor->data + offset, data, size);
1535
+
1536
+ UNUSED(backend);
1537
+ }
1538
+
1539
+ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
1540
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
1541
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
1542
+
1543
+ memcpy(data, (const char *)tensor->data + offset, size);
1544
+
1545
+ UNUSED(backend);
1546
+ }
1547
+
1548
+ static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
1549
+ UNUSED(backend);
1550
+ }
1551
+
1552
+ static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
1553
+ ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
1554
+
1555
+ UNUSED(backend);
1556
+ }
1557
+
1558
+ static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
1559
+ ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
1560
+
1561
+ UNUSED(backend);
1562
+ }
1563
+
1564
+ static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
1565
+ struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
1566
+
1567
+ ggml_metal_graph_compute(metal_ctx, cgraph);
1568
+ }
1569
+
1570
+ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
1571
+ return true;
1572
+ UNUSED(backend);
1573
+ UNUSED(op);
1574
+ }
1575
+
1576
+ static struct ggml_backend_i metal_backend_i = {
1577
+ /* .get_name = */ ggml_backend_metal_name,
1578
+ /* .free = */ ggml_backend_metal_free,
1579
+ /* .alloc_buffer = */ ggml_backend_metal_alloc_buffer,
1580
+ /* .get_alignment = */ ggml_backend_metal_get_alignment,
1581
+ /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
1582
+ /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
1583
+ /* .synchronize = */ ggml_backend_metal_synchronize,
1584
+ /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from,
1585
+ /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to,
1586
+ /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm
1587
+ /* .graph_plan_free = */ NULL,
1588
+ /* .graph_plan_compute = */ NULL,
1589
+ /* .graph_compute = */ ggml_backend_metal_graph_compute,
1590
+ /* .supports_op = */ ggml_backend_metal_supports_op,
1591
+ };
1592
+
1593
+ ggml_backend_t ggml_backend_metal_init(void) {
1594
+ struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
1595
+
1596
+ ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
1597
+
1598
+ ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
1599
+
1600
+ *metal_backend = (struct ggml_backend) {
1601
+ /* .interface = */ metal_backend_i,
1602
+ /* .context = */ ctx,
1603
+ };
1604
+
1605
+ return metal_backend;
1606
+ }
1607
+
1608
+ bool ggml_backend_is_metal(ggml_backend_t backend) {
1609
+ return backend->iface.get_name == ggml_backend_metal_name;
1610
+ }
1611
+
1612
+ void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
1613
+ struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
1614
+
1615
+ ggml_metal_set_n_cb(ctx, n_cb);
1616
+ }
ggml-metal.metal CHANGED
@@ -345,10 +345,11 @@ kernel void kernel_rms_norm(
345
  uint sgitg[[simdgroup_index_in_threadgroup]],
346
  uint tiisg[[thread_index_in_simdgroup]],
347
  uint ntg[[threads_per_threadgroup]]) {
348
- device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
349
- device const float * x_scalar = (device const float *) x;
350
- float4 sumf=0;
351
- float all_sum=0;
 
352
 
353
  // parallel sum
354
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
@@ -361,6 +362,7 @@ kernel void kernel_rms_norm(
361
  }
362
 
363
  threadgroup_barrier(mem_flags::mem_threadgroup);
 
364
  // broadcast, simd group number is ntg / 32
365
  for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
366
  if (tpitg < i) {
@@ -368,7 +370,9 @@ kernel void kernel_rms_norm(
368
  }
369
  }
370
  if (tpitg == 0) {
371
- for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
 
 
372
  sum[0] /= ne00;
373
  }
374
 
@@ -383,7 +387,9 @@ kernel void kernel_rms_norm(
383
  y[i00] = x[i00] * scale;
384
  }
385
  if (tpitg == 0) {
386
- for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
 
 
387
  }
388
  }
389
 
 
345
  uint sgitg[[simdgroup_index_in_threadgroup]],
346
  uint tiisg[[thread_index_in_simdgroup]],
347
  uint ntg[[threads_per_threadgroup]]) {
348
+ device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
349
+ device const float * x_scalar = (device const float *) x;
350
+
351
+ float4 sumf = 0;
352
+ float all_sum = 0;
353
 
354
  // parallel sum
355
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
 
362
  }
363
 
364
  threadgroup_barrier(mem_flags::mem_threadgroup);
365
+
366
  // broadcast, simd group number is ntg / 32
367
  for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
368
  if (tpitg < i) {
 
370
  }
371
  }
372
  if (tpitg == 0) {
373
+ for (int i = 4 * (ne00 / 4); i < ne00; i++) {
374
+ sum[0] += x_scalar[i];
375
+ }
376
  sum[0] /= ne00;
377
  }
378
 
 
387
  y[i00] = x[i00] * scale;
388
  }
389
  if (tpitg == 0) {
390
+ for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
391
+ y_scalar[i00] = x_scalar[i00] * scale;
392
+ }
393
  }
394
  }
395
 
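
The reconstructed kernel_rms_norm loops above follow the usual pattern of reducing a row four floats at a time and then folding in the elements past 4 * (ne00 / 4) with a scalar tail (the new GGML_ASSERT(ne00 % 4 == 0) in ggml-metal.m means that tail is not expected to run for RMS_NORM, but the structure is kept). A plain-C analogue of that loop shape, purely illustrative and not part of the diff:

// Illustrative C analogue of the float4-accumulate + scalar-tail pattern.
static float sum_f32(const float * x, int ne00) {
    float acc4[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    for (int i = 0; i + 4 <= ne00; i += 4) {        // 4-wide chunks, like the float4 loop
        acc4[0] += x[i+0];
        acc4[1] += x[i+1];
        acc4[2] += x[i+2];
        acc4[3] += x[i+3];
    }
    float sum = acc4[0] + acc4[1] + acc4[2] + acc4[3];
    for (int i = 4*(ne00/4); i < ne00; ++i) {       // scalar tail, same bound as the kernel
        sum += x[i];
    }
    return sum;
}
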
ggml.c CHANGED
@@ -162,40 +162,16 @@ typedef void * thread_ret_t;
162
 
163
  #define GGML_PRINT(...) printf(__VA_ARGS__)
164
 
 
 
 
 
165
  #ifdef GGML_USE_ACCELERATE
166
  // uncomment to use vDSP for soft max computation
167
  // note: not sure if it is actually faster
168
  //#define GGML_SOFT_MAX_ACCELERATE
169
  #endif
170
 
171
- //
172
- // logging
173
- //
174
-
175
- #if (GGML_DEBUG >= 1)
176
- #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
177
- #else
178
- #define GGML_PRINT_DEBUG(...)
179
- #endif
180
-
181
- #if (GGML_DEBUG >= 5)
182
- #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
183
- #else
184
- #define GGML_PRINT_DEBUG_5(...)
185
- #endif
186
-
187
- #if (GGML_DEBUG >= 10)
188
- #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
189
- #else
190
- #define GGML_PRINT_DEBUG_10(...)
191
- #endif
192
-
193
- #define GGML_PRINT(...) printf(__VA_ARGS__)
194
-
195
- //
196
- // end of logging block
197
- //
198
-
199
  #if defined(_MSC_VER) || defined(__MINGW32__)
200
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
201
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
@@ -4952,6 +4928,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4952
  *result = (struct ggml_tensor) {
4953
  /*.type =*/ type,
4954
  /*.backend =*/ GGML_BACKEND_CPU,
 
4955
  /*.n_dims =*/ n_dims,
4956
  /*.ne =*/ { 1, 1, 1, 1 },
4957
  /*.nb =*/ { 0, 0, 0, 0 },
@@ -11257,7 +11234,7 @@ static void ggml_compute_forward_silu_f32(
11257
 
11258
  #ifndef NDEBUG
11259
  for (int k = 0; k < nc; k++) {
11260
- const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
11261
  UNUSED(x);
11262
  assert(!isnan(x));
11263
  assert(!isinf(x));
@@ -13083,24 +13060,22 @@ static void ggml_compute_forward_alibi_f32(
13083
  return;
13084
  }
13085
 
13086
- const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
13087
  const int n_head = ((int32_t *) dst->op_params)[1];
13088
  float max_bias;
13089
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
13090
 
13091
- assert(n_past >= 0);
13092
-
13093
- const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
13094
- const int ne1 = src0->ne[1]; // seq_len_without_past
13095
- const int ne2 = src0->ne[2]; // n_head -> this is k
13096
- //const int ne3 = src0->ne[3]; // 1 -> bsz
13097
 
13098
- const int n = ggml_nrows(src0);
13099
- const int ne2_ne3 = n/ne1; // ne2*ne3
13100
 
13101
- const int nb0 = src0->nb[0];
13102
- const int nb1 = src0->nb[1];
13103
- const int nb2 = src0->nb[2];
13104
  //const int nb3 = src0->nb[3];
13105
 
13106
  GGML_ASSERT(nb0 == sizeof(float));
@@ -13112,9 +13087,9 @@ static void ggml_compute_forward_alibi_f32(
13112
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
13113
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
13114
 
13115
- for (int i = 0; i < ne0; i++) {
13116
- for (int j = 0; j < ne1; j++) {
13117
- for (int k = 0; k < ne2_ne3; k++) {
13118
  float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
13119
  float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
13120
 
@@ -13129,7 +13104,6 @@ static void ggml_compute_forward_alibi_f32(
13129
  }
13130
 
13131
  pdst[0] = i * m_k + src[0];
13132
-
13133
  }
13134
  }
13135
  }
@@ -20200,6 +20174,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
20200
  ggml_vec_cpy_f32(nx, xp, x);
20201
  ggml_vec_cpy_f32(nx, gp, g);
20202
 
 
 
 
 
20203
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
20204
  if (cancel) {
20205
  return GGML_OPT_CANCEL;
 
162
 
163
  #define GGML_PRINT(...) printf(__VA_ARGS__)
164
 
165
+ //
166
+ // end of logging block
167
+ //
168
+
169
  #ifdef GGML_USE_ACCELERATE
170
  // uncomment to use vDSP for soft max computation
171
  // note: not sure if it is actually faster
172
  //#define GGML_SOFT_MAX_ACCELERATE
173
  #endif
174
 
175
  #if defined(_MSC_VER) || defined(__MINGW32__)
176
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
177
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 
4928
  *result = (struct ggml_tensor) {
4929
  /*.type =*/ type,
4930
  /*.backend =*/ GGML_BACKEND_CPU,
4931
+ /*.buffer =*/ NULL,
4932
  /*.n_dims =*/ n_dims,
4933
  /*.ne =*/ { 1, 1, 1, 1 },
4934
  /*.nb =*/ { 0, 0, 0, 0 },
 
11234
 
11235
  #ifndef NDEBUG
11236
  for (int k = 0; k < nc; k++) {
11237
+ const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
11238
  UNUSED(x);
11239
  assert(!isnan(x));
11240
  assert(!isinf(x));
 
13060
  return;
13061
  }
13062
 
13063
+ //const int n_past = ((int32_t *) dst->op_params)[0];
13064
  const int n_head = ((int32_t *) dst->op_params)[1];
13065
  float max_bias;
13066
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
13067
 
13068
+ const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
13069
+ const int64_t ne1 = src0->ne[1]; // seq_len_without_past
13070
+ const int64_t ne2 = src0->ne[2]; // n_head -> this is k
13071
+ //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
 
 
13072
 
13073
+ const int64_t n = ggml_nrows(src0);
13074
+ const int64_t ne2_ne3 = n/ne1; // ne2*ne3
13075
 
13076
+ const size_t nb0 = src0->nb[0];
13077
+ const size_t nb1 = src0->nb[1];
13078
+ const size_t nb2 = src0->nb[2];
13079
  //const int nb3 = src0->nb[3];
13080
 
13081
  GGML_ASSERT(nb0 == sizeof(float));
 
13087
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
13088
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
13089
 
13090
+ for (int64_t i = 0; i < ne0; i++) {
13091
+ for (int64_t j = 0; j < ne1; j++) {
13092
+ for (int64_t k = 0; k < ne2_ne3; k++) {
13093
  float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
13094
  float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
13095
 
 
13104
  }
13105
 
13106
  pdst[0] = i * m_k + src[0];
 
13107
  }
13108
  }
13109
  }
 
20174
  ggml_vec_cpy_f32(nx, xp, x);
20175
  ggml_vec_cpy_f32(nx, gp, g);
20176
 
20177
+ // TODO: instead of passing &cancel here, use the return code of the linesearch
20178
+ // to determine if the optimization should be cancelled
20179
+ // this is a simple change, but not doing this atm, since I don't have a nice
20180
+ // way to test and don't want to break something with so many changes lined up
20181
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
20182
  if (cancel) {
20183
  return GGML_OPT_CANCEL;
ggml.h CHANGED
@@ -326,7 +326,7 @@ extern "C" {
326
  GGML_TYPE_COUNT,
327
  };
328
 
329
- enum ggml_backend {
330
  GGML_BACKEND_CPU = 0,
331
  GGML_BACKEND_GPU = 10,
332
  GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,8 +479,10 @@ extern "C" {
479
 
480
  // n-dimensional tensor
481
  struct ggml_tensor {
482
- enum ggml_type type;
483
- enum ggml_backend backend;
 
 
484
 
485
  int n_dims;
486
  int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -514,7 +516,7 @@ extern "C" {
514
 
515
  void * extra; // extra things e.g. for ggml-cuda.cu
516
 
517
- char padding[4];
518
  };
519
 
520
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -1358,7 +1360,7 @@ extern "C" {
1358
 
1359
  // alibi position embedding
1360
  // in-place, returns view(a)
1361
- struct ggml_tensor * ggml_alibi(
1362
  struct ggml_context * ctx,
1363
  struct ggml_tensor * a,
1364
  int n_past,
@@ -1367,7 +1369,7 @@ extern "C" {
1367
 
1368
  // clamp
1369
  // in-place, returns view(a)
1370
- struct ggml_tensor * ggml_clamp(
1371
  struct ggml_context * ctx,
1372
  struct ggml_tensor * a,
1373
  float min,
@@ -2102,7 +2104,7 @@ extern "C" {
2102
  enum ggml_type vec_dot_type;
2103
  } ggml_type_traits_t;
2104
 
2105
- ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2106
 
2107
  #ifdef __cplusplus
2108
  }
 
326
  GGML_TYPE_COUNT,
327
  };
328
 
329
+ enum ggml_backend_type {
330
  GGML_BACKEND_CPU = 0,
331
  GGML_BACKEND_GPU = 10,
332
  GGML_BACKEND_GPU_SPLIT = 20,
 
479
 
480
  // n-dimensional tensor
481
  struct ggml_tensor {
482
+ enum ggml_type type;
483
+ enum ggml_backend_type backend;
484
+
485
+ struct ggml_backend_buffer * buffer;
486
 
487
  int n_dims;
488
  int64_t ne[GGML_MAX_DIMS]; // number of elements
 
516
 
517
  void * extra; // extra things e.g. for ggml-cuda.cu
518
 
519
+ char padding[12];
520
  };
521
 
522
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
1360
 
1361
  // alibi position embedding
1362
  // in-place, returns view(a)
1363
+ GGML_API struct ggml_tensor * ggml_alibi(
1364
  struct ggml_context * ctx,
1365
  struct ggml_tensor * a,
1366
  int n_past,
 
1369
 
1370
  // clamp
1371
  // in-place, returns view(a)
1372
+ GGML_API struct ggml_tensor * ggml_clamp(
1373
  struct ggml_context * ctx,
1374
  struct ggml_tensor * a,
1375
  float min,
 
2104
  enum ggml_type vec_dot_type;
2105
  } ggml_type_traits_t;
2106
 
2107
+ GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2108
 
2109
  #ifdef __cplusplus
2110
  }
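
The two ggml_tensor changes above are coupled: the struct gains a struct ggml_backend_buffer * (8 bytes on 64-bit targets) and the trailing padding grows from 4 to 12 bytes, so the struct grows by exactly 16 bytes. Presumably this keeps sizeof(struct ggml_tensor) a multiple of GGML_MEM_ALIGN; a hedged compile-time check of that assumed invariant:

// Hedged sketch: verify the assumed layout invariant after adding the buffer field.
// GGML_MEM_ALIGN == 16 is an assumption here, not stated in this hunk.
#include <assert.h>
#include "ggml.h"

static_assert(sizeof(struct ggml_tensor) % 16 == 0,
              "ggml_tensor size should stay a multiple of 16");
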
gguf-py/gguf/gguf.py CHANGED
@@ -88,29 +88,31 @@ class MODEL_ARCH(IntEnum):
88
  PERSIMMON : int = auto()
89
  REFACT : int = auto()
90
  BERT : int = auto()
 
91
 
92
 
93
  class MODEL_TENSOR(IntEnum):
94
- TOKEN_EMBD : int = auto()
95
- TOKEN_TYPES : int = auto()
96
- POS_EMBD : int = auto()
97
- OUTPUT : int = auto()
98
- OUTPUT_NORM : int = auto()
99
- ROPE_FREQS : int = auto()
100
- ATTN_Q : int = auto()
101
- ATTN_K : int = auto()
102
- ATTN_V : int = auto()
103
- ATTN_QKV : int = auto()
104
- ATTN_OUT : int = auto()
105
- ATTN_NORM : int = auto()
106
- ATTN_NORM_2 : int = auto()
107
- ATTN_ROT_EMBD: int = auto()
108
- FFN_GATE : int = auto()
109
- FFN_DOWN : int = auto()
110
- FFN_UP : int = auto()
111
- FFN_NORM : int = auto()
112
- ATTN_Q_NORM : int = auto()
113
- ATTN_K_NORM : int = auto()
 
114
 
115
 
116
  MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -125,29 +127,31 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
125
  MODEL_ARCH.PERSIMMON: "persimmon",
126
  MODEL_ARCH.REFACT: "refact",
127
  MODEL_ARCH.BERT: "bert",
 
128
  }
129
 
130
  TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
131
- MODEL_TENSOR.TOKEN_EMBD: "token_embd",
132
- MODEL_TENSOR.TOKEN_TYPES: "token_types",
133
- MODEL_TENSOR.POS_EMBD: "position_embd",
134
- MODEL_TENSOR.OUTPUT_NORM: "output_norm",
135
- MODEL_TENSOR.OUTPUT: "output",
136
- MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
137
- MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
138
- MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
139
- MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
140
- MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
141
- MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
142
- MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
143
- MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
144
- MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
145
- MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
146
- MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
147
- MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
148
- MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
149
- MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
150
- MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
 
151
  }
152
 
153
  MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -282,6 +286,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
282
  MODEL_TENSOR.FFN_DOWN,
283
  MODEL_TENSOR.FFN_UP,
284
  ],
 
 
 
 
 
 
 
 
 
 
 
 
285
  MODEL_ARCH.GPT2: [
286
  # TODO
287
  ],
@@ -311,6 +327,7 @@ class TensorNameMap:
311
  "gpt_neox.embed_in", # gptneox
312
  "transformer.wte", # gpt2 gpt-j mpt refact
313
  "transformer.word_embeddings", # falcon
 
314
  "model.embed_tokens", # llama-hf
315
  "tok_embeddings", # llama-pth
316
  "embeddings.word_embeddings", # bert
@@ -322,6 +339,11 @@ class TensorNameMap:
322
  "embeddings.token_type_embeddings", # bert
323
  ),
324
 
 
 
 
 
 
325
  # Position embeddings
326
  MODEL_TENSOR.POS_EMBD: (
327
  "transformer.wpe", # gpt2
@@ -332,7 +354,7 @@ class TensorNameMap:
332
  MODEL_TENSOR.OUTPUT: (
333
  "embed_out", # gptneox
334
  "lm_head", # gpt2 mpt falcon llama-hf baichuan
335
- "output", # llama-pth
336
  "word_embeddings_for_head", # persimmon
337
  ),
338
 
@@ -344,7 +366,7 @@ class TensorNameMap:
344
  "norm", # llama-pth
345
  "embeddings.LayerNorm", # bert
346
  "transformer.norm_f", # mpt
347
- "ln_f", # refact
348
  "language_model.encoder.final_layernorm", # persimmon
349
  ),
350
 
@@ -361,6 +383,7 @@ class TensorNameMap:
361
  "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
362
  "transformer.blocks.{bid}.norm_1", # mpt
363
  "transformer.h.{bid}.input_layernorm", # falcon7b
 
364
  "transformer.h.{bid}.ln_mlp", # falcon40b
365
  "model.layers.{bid}.input_layernorm", # llama-hf
366
  "layers.{bid}.attention_norm", # llama-pth
@@ -379,6 +402,7 @@ class TensorNameMap:
379
  "transformer.h.{bid}.attn.c_attn", # gpt2
380
  "transformer.blocks.{bid}.attn.Wqkv", # mpt
381
  "transformer.h.{bid}.self_attention.query_key_value", # falcon
 
382
  "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
383
  ),
384
 
@@ -412,6 +436,7 @@ class TensorNameMap:
412
  "transformer.h.{bid}.attn.c_proj", # gpt2 refact
413
  "transformer.blocks.{bid}.attn.out_proj", # mpt
414
  "transformer.h.{bid}.self_attention.dense", # falcon
 
415
  "model.layers.{bid}.self_attn.o_proj", # llama-hf
416
  "layers.{bid}.attention.wo", # llama-pth
417
  "encoder.layer.{bid}.attention.output.dense", # bert
@@ -429,6 +454,7 @@ class TensorNameMap:
429
  MODEL_TENSOR.FFN_NORM: (
430
  "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
431
  "transformer.h.{bid}.ln_2", # gpt2 refact
 
432
  "transformer.blocks.{bid}.norm_2", # mpt
433
  "model.layers.{bid}.post_attention_layernorm", # llama-hf
434
  "layers.{bid}.ffn_norm", # llama-pth
@@ -442,6 +468,7 @@ class TensorNameMap:
442
  "transformer.h.{bid}.mlp.c_fc", # gpt2
443
  "transformer.blocks.{bid}.ffn.up_proj", # mpt
444
  "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
 
445
  "model.layers.{bid}.mlp.up_proj", # llama-hf refact
446
  "layers.{bid}.feed_forward.w3", # llama-pth
447
  "encoder.layer.{bid}.intermediate.dense", # bert
@@ -461,6 +488,7 @@ class TensorNameMap:
461
  "transformer.h.{bid}.mlp.c_proj", # gpt2 refact
462
  "transformer.blocks.{bid}.ffn.down_proj", # mpt
463
  "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
 
464
  "model.layers.{bid}.mlp.down_proj", # llama-hf
465
  "layers.{bid}.feed_forward.w2", # llama-pth
466
  "encoder.layer.{bid}.output.dense", # bert
 
88
  PERSIMMON : int = auto()
89
  REFACT : int = auto()
90
  BERT : int = auto()
91
+ BLOOM : int = auto()
92
 
93
 
94
  class MODEL_TENSOR(IntEnum):
95
+ TOKEN_EMBD : int = auto()
96
+ TOKEN_EMBD_NORM : int = auto()
97
+ TOKEN_TYPES : int = auto()
98
+ POS_EMBD : int = auto()
99
+ OUTPUT : int = auto()
100
+ OUTPUT_NORM : int = auto()
101
+ ROPE_FREQS : int = auto()
102
+ ATTN_Q : int = auto()
103
+ ATTN_K : int = auto()
104
+ ATTN_V : int = auto()
105
+ ATTN_QKV : int = auto()
106
+ ATTN_OUT : int = auto()
107
+ ATTN_NORM : int = auto()
108
+ ATTN_NORM_2 : int = auto()
109
+ ATTN_ROT_EMBD : int = auto()
110
+ FFN_GATE : int = auto()
111
+ FFN_DOWN : int = auto()
112
+ FFN_UP : int = auto()
113
+ FFN_NORM : int = auto()
114
+ ATTN_Q_NORM : int = auto()
115
+ ATTN_K_NORM : int = auto()
116
 
117
 
118
  MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
 
127
  MODEL_ARCH.PERSIMMON: "persimmon",
128
  MODEL_ARCH.REFACT: "refact",
129
  MODEL_ARCH.BERT: "bert",
130
+ MODEL_ARCH.BLOOM: "bloom",
131
  }
132
 
133
  TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
134
+ MODEL_TENSOR.TOKEN_EMBD: "token_embd",
135
+ MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
136
+ MODEL_TENSOR.TOKEN_TYPES: "token_types",
137
+ MODEL_TENSOR.POS_EMBD: "position_embd",
138
+ MODEL_TENSOR.OUTPUT_NORM: "output_norm",
139
+ MODEL_TENSOR.OUTPUT: "output",
140
+ MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
141
+ MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
142
+ MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
143
+ MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
144
+ MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
145
+ MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
146
+ MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
147
+ MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
148
+ MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
149
+ MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
150
+ MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
151
+ MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
152
+ MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
153
+ MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
154
+ MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
155
  }
156
 
157
  MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
 
286
  MODEL_TENSOR.FFN_DOWN,
287
  MODEL_TENSOR.FFN_UP,
288
  ],
289
+ MODEL_ARCH.BLOOM: [
290
+ MODEL_TENSOR.TOKEN_EMBD,
291
+ MODEL_TENSOR.TOKEN_EMBD_NORM,
292
+ MODEL_TENSOR.OUTPUT_NORM,
293
+ MODEL_TENSOR.OUTPUT,
294
+ MODEL_TENSOR.ATTN_NORM,
295
+ MODEL_TENSOR.ATTN_QKV,
296
+ MODEL_TENSOR.ATTN_OUT,
297
+ MODEL_TENSOR.FFN_NORM,
298
+ MODEL_TENSOR.FFN_DOWN,
299
+ MODEL_TENSOR.FFN_UP,
300
+ ],
301
  MODEL_ARCH.GPT2: [
302
  # TODO
303
  ],
 
327
  "gpt_neox.embed_in", # gptneox
328
  "transformer.wte", # gpt2 gpt-j mpt refact
329
  "transformer.word_embeddings", # falcon
330
+ "word_embeddings", # bloom
331
  "model.embed_tokens", # llama-hf
332
  "tok_embeddings", # llama-pth
333
  "embeddings.word_embeddings", # bert
 
339
  "embeddings.token_type_embeddings", # bert
340
  ),
341
 
342
+ # Normalization of token embeddings
343
+ MODEL_TENSOR.TOKEN_EMBD_NORM: (
344
+ "word_embeddings_layernorm", # bloom
345
+ ),
346
+
347
  # Position embeddings
348
  MODEL_TENSOR.POS_EMBD: (
349
  "transformer.wpe", # gpt2
 
354
  MODEL_TENSOR.OUTPUT: (
355
  "embed_out", # gptneox
356
  "lm_head", # gpt2 mpt falcon llama-hf baichuan
357
+ "output", # llama-pth bloom
358
  "word_embeddings_for_head", # persimmon
359
  ),
360
 
 
366
  "norm", # llama-pth
367
  "embeddings.LayerNorm", # bert
368
  "transformer.norm_f", # mpt
369
+ "ln_f", # refact bloom
370
  "language_model.encoder.final_layernorm", # persimmon
371
  ),
372
 
 
383
  "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
384
  "transformer.blocks.{bid}.norm_1", # mpt
385
  "transformer.h.{bid}.input_layernorm", # falcon7b
386
+ "h.{bid}.input_layernorm", # bloom
387
  "transformer.h.{bid}.ln_mlp", # falcon40b
388
  "model.layers.{bid}.input_layernorm", # llama-hf
389
  "layers.{bid}.attention_norm", # llama-pth
 
402
  "transformer.h.{bid}.attn.c_attn", # gpt2
403
  "transformer.blocks.{bid}.attn.Wqkv", # mpt
404
  "transformer.h.{bid}.self_attention.query_key_value", # falcon
405
+ "h.{bid}.self_attention.query_key_value", # bloom
406
  "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
407
  ),
408
 
 
436
  "transformer.h.{bid}.attn.c_proj", # gpt2 refact
437
  "transformer.blocks.{bid}.attn.out_proj", # mpt
438
  "transformer.h.{bid}.self_attention.dense", # falcon
439
+ "h.{bid}.self_attention.dense", # bloom
440
  "model.layers.{bid}.self_attn.o_proj", # llama-hf
441
  "layers.{bid}.attention.wo", # llama-pth
442
  "encoder.layer.{bid}.attention.output.dense", # bert
 
454
  MODEL_TENSOR.FFN_NORM: (
455
  "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
456
  "transformer.h.{bid}.ln_2", # gpt2 refact
457
+ "h.{bid}.post_attention_layernorm", # bloom
458
  "transformer.blocks.{bid}.norm_2", # mpt
459
  "model.layers.{bid}.post_attention_layernorm", # llama-hf
460
  "layers.{bid}.ffn_norm", # llama-pth
 
468
  "transformer.h.{bid}.mlp.c_fc", # gpt2
469
  "transformer.blocks.{bid}.ffn.up_proj", # mpt
470
  "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
471
+ "h.{bid}.mlp.dense_h_to_4h", # bloom
472
  "model.layers.{bid}.mlp.up_proj", # llama-hf refact
473
  "layers.{bid}.feed_forward.w3", # llama-pth
474
  "encoder.layer.{bid}.intermediate.dense", # bert
 
488
  "transformer.h.{bid}.mlp.c_proj", # gpt2 refact
489
  "transformer.blocks.{bid}.ffn.down_proj", # mpt
490
  "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
491
+ "h.{bid}.mlp.dense_4h_to_h", # bloom
492
  "model.layers.{bid}.mlp.down_proj", # llama-hf
493
  "layers.{bid}.feed_forward.w2", # llama-pth
494
  "encoder.layer.{bid}.output.dense", # bert
gpttype_adapter.cpp CHANGED
@@ -1768,7 +1768,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1768
  int realnpredict = params.n_predict-stopper_unused_tokens;
1769
  float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
1770
  float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
1771
- printf("\nTime Taken - Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs (%.1fT/s)", time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
1772
  fflush(stdout);
1773
  output.status = 1;
1774
  generation_finished = true;
 
1768
  int realnpredict = params.n_predict-stopper_unused_tokens;
1769
  float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
1770
  float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
1771
+ printf("\nContextLimit: %d/%d, Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs (%.1fT/s)",current_context_tokens.size(),nctx, time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
1772
  fflush(stdout);
1773
  output.status = 1;
1774
  generation_finished = true;
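The reworked log line above adds a ContextLimit readout next to the existing timings. For reference, a hedged sketch of how the printed figures relate, with time1 as prompt-processing seconds and time2 as generation seconds as in the surrounding code:

```python
# Hedged sketch: reproduce the numbers printed by the new log line from the
# quantities visible in the diff. Not the actual C++ code, just the arithmetic.
def summarize(time1, time2, realnpredict, used_ctx, nctx):
    pt2 = time2 * 1000.0 / (realnpredict if realnpredict else 1)   # ms per generated token
    tps = 0.0 if realnpredict == 0 else realnpredict / (time1 + time2)
    return (f"ContextLimit: {used_ctx}/{nctx}, Generation:{time2:.1f}s "
            f"({pt2:.0f}ms/T), Total:{time1 + time2:.1f}s ({tps:.1f}T/s)")

print(summarize(1.2, 6.0, 120, 512, 2048))
```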
koboldcpp.py CHANGED
@@ -184,6 +184,10 @@ def init_library():
184
  os.add_dll_directory(dir_path)
185
  os.add_dll_directory(abs_path)
186
  os.add_dll_directory(os.getcwd())
187
  handle = ctypes.CDLL(os.path.join(dir_path, libname))
188
 
189
  handle.load_model.argtypes = [load_model_inputs]
@@ -361,7 +365,7 @@ maxhordelen = 256
361
  modelbusy = threading.Lock()
362
  requestsinqueue = 0
363
  defaultport = 5001
364
- KcppVersion = "1.46.1"
365
  showdebug = True
366
  showsamplerwarning = True
367
  showmaxctxwarning = True
@@ -369,6 +373,8 @@ session_kudos_earned = 0
369
  session_jobs = 0
370
  session_starttime = None
371
  exitcounter = 0
372
  totalgens = 0
373
  currentusergenkey = "" #store a special key so polled streaming works even in multiuser
374
  args = None #global args
@@ -412,16 +418,34 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
412
  elif api_format==4:
413
  # translate openai chat completion messages format into one big string.
414
  messages_array = genparams.get('messages', [])
415
  messages_string = ""
416
  for message in messages_array:
417
  if message['role'] == "system":
418
- messages_string+="\n### Instruction:\n"
419
  elif message['role'] == "user":
420
- messages_string+="\n### Instruction:\n"
421
  elif message['role'] == "assistant":
422
- messages_string+="\n### Response:\n"
423
- messages_string+=message['content']
424
- messages_string += "\n### Response:\n"
425
  genparams["prompt"] = messages_string
426
  frqp = genparams.get('frequency_penalty', 0.1)
427
  scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
@@ -497,9 +521,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
497
  async def handle_sse_stream(self, api_format):
498
  global friendlymodelname
499
  self.send_response(200)
500
- self.send_header("Cache-Control", "no-cache")
501
- self.send_header("Connection", "keep-alive")
502
- self.end_headers(force_json=True, sse_stream_flag=True)
503
 
504
  current_token = 0
505
  incomplete_token_buffer = bytearray()
@@ -566,10 +590,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
566
  global maxctx, maxhordelen, friendlymodelname, KcppVersion, totalgens
567
  self.path = self.path.rstrip('/')
568
  response_body = None
569
- force_json = False
570
 
571
  if self.path in ["", "/?"] or self.path.startswith(('/?','?')): #it's possible for the root url to have ?params without /
572
-
573
  if self.embedded_kailite is None:
574
  response_body = (f"Embedded Kobold Lite is not found.<br>You will have to connect via the main KoboldAI client, or <a href='https://lite.koboldai.net?local=1&port={self.port}'>use this URL</a> to connect.").encode()
575
  else:
@@ -615,9 +639,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
615
 
616
  elif self.path.endswith('/v1/models'):
617
  response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
618
- force_json = True
619
 
620
  elif self.path=="/api":
 
621
  if self.embedded_kcpp_docs is None:
622
  response_body = (f"KoboldCpp partial API reference can be found at the wiki: https://github.com/LostRuins/koboldcpp/wiki").encode()
623
  else:
@@ -625,41 +649,40 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
625
  elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
626
  self.path = "/api"
627
  self.send_response(302)
628
- self.send_header("Location", self.path)
629
- self.end_headers()
630
  return None
631
 
632
  if response_body is None:
633
  self.send_response(404)
634
- self.end_headers()
635
  rp = 'Error: HTTP Server is running, but this endpoint does not exist. Please check the URL.'
636
  self.wfile.write(rp.encode())
637
  else:
638
  self.send_response(200)
639
- self.send_header('Content-Length', str(len(response_body)))
640
- self.end_headers(force_json=force_json)
641
  self.wfile.write(response_body)
642
  return
643
 
644
  def do_POST(self):
645
  global modelbusy, requestsinqueue, currentusergenkey, totalgens
646
- content_length = int(self.headers['Content-Length'])
647
  body = self.rfile.read(content_length)
648
  self.path = self.path.rstrip('/')
649
- force_json = False
650
  if self.path.endswith(('/api/extra/tokencount')):
651
  try:
652
  genparams = json.loads(body)
653
  countprompt = genparams.get('prompt', "")
654
  count = handle.token_count(countprompt.encode("UTF-8"))
655
  self.send_response(200)
656
- self.end_headers()
657
  self.wfile.write(json.dumps({"value": count}).encode())
658
 
659
  except ValueError as e:
660
  utfprint("Count Tokens - Body Error: " + str(e))
661
  self.send_response(400)
662
- self.end_headers()
663
  self.wfile.write(json.dumps({"value": -1}).encode())
664
  return
665
 
@@ -672,11 +695,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
672
  multiuserkey = ""
673
  pass
674
 
675
- if (multiuserkey!="" and multiuserkey==currentusergenkey) or requestsinqueue==0:
676
  ag = handle.abort_generate()
677
  time.sleep(0.3) #short delay before replying
678
  self.send_response(200)
679
- self.end_headers()
680
  self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
681
  print("\nGeneration Aborted")
682
  else:
@@ -694,11 +717,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
694
  pass
695
 
696
  if totalgens>0:
697
- if (multiuserkey!="" and multiuserkey==currentusergenkey) or requestsinqueue==0:
698
  pendtxt = handle.get_pending_output()
699
  pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore")
700
  self.send_response(200)
701
- self.end_headers()
702
  self.wfile.write(json.dumps({"results": [{"text": pendtxtStr}]}).encode())
703
  return
704
 
@@ -708,7 +731,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
708
  requestsinqueue += 1
709
  if not modelbusy.acquire(blocking=reqblocking):
710
  self.send_response(503)
711
- self.end_headers()
712
  self.wfile.write(json.dumps({"detail": {
713
  "msg": "Server is busy; please try again later.",
714
  "type": "service_unavailable",
@@ -734,11 +757,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
734
 
735
  if self.path.endswith('/v1/completions'):
736
  api_format = 3
737
- force_json = True
738
 
739
  if self.path.endswith('/v1/chat/completions'):
740
  api_format = 4
741
- force_json = True
742
 
743
  if api_format > 0:
744
  genparams = None
@@ -764,8 +785,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
764
  # Headers are already sent when streaming
765
  if not sse_stream_flag:
766
  self.send_response(200)
767
- self.end_headers(force_json=force_json)
768
- self.wfile.write(json.dumps(gen).encode())
769
  except:
770
  print("Generate: The response could not be sent, maybe connection was terminated?")
771
  return
@@ -773,27 +794,23 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
773
  modelbusy.release()
774
 
775
  self.send_response(404)
776
- self.end_headers()
777
 
778
 
779
  def do_OPTIONS(self):
780
  self.send_response(200)
781
- self.end_headers()
782
 
783
  def do_HEAD(self):
784
  self.send_response(200)
785
- self.end_headers()
786
-
787
- def end_headers(self, force_json=False, sse_stream_flag=False):
788
- self.send_header('Access-Control-Allow-Origin', '*')
789
- self.send_header('Access-Control-Allow-Methods', '*')
790
- self.send_header('Access-Control-Allow-Headers', '*')
791
- if ("/api" in self.path and self.path!="/api") or force_json:
792
- if sse_stream_flag:
793
- self.send_header('Content-type', 'text/event-stream')
794
- self.send_header('Content-type', 'application/json')
795
- else:
796
- self.send_header('Content-type', 'text/html')
797
  return super(ServerRequestHandler, self).end_headers()
798
 
799
 
@@ -1017,7 +1034,8 @@ def show_new_gui():
1017
  mmq_var = ctk.IntVar(value=1)
1018
  blas_threads_var = ctk.StringVar()
1019
  blas_size_var = ctk.IntVar()
1020
- version_var =ctk.StringVar(value="0")
 
1021
 
1022
  smartcontext = ctk.IntVar()
1023
  context_var = ctk.IntVar()
@@ -1069,11 +1087,15 @@ def show_new_gui():
1069
  quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
1070
  mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
1071
  quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
1072
  else:
1073
  lowvram_box.grid_forget()
1074
  quick_lowvram_box.grid_forget()
1075
  mmq_box.grid_forget()
1076
  quick_mmq_box.grid_forget()
1077
 
1078
  if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
1079
  gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
@@ -1086,6 +1108,7 @@ def show_new_gui():
1086
  quick_gpu_layers_label.grid_forget()
1087
  quick_gpu_layers_entry.grid_forget()
1088
 
 
1089
  # presets selector
1090
  makelabel(quick_tab, "Presets:", 1)
1091
 
@@ -1118,7 +1141,7 @@ def show_new_gui():
1118
  makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2)
1119
 
1120
  # load model
1121
- makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170,filetypes=[("GGML Model Files", "*.gguf;*.bin;*.ggml")])
1122
 
1123
  # Hardware Tab
1124
  hardware_tab = tabcontent["Hardware"]
@@ -1137,6 +1160,7 @@ def show_new_gui():
1137
  gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4"], width=60, variable=gpu_choice_var, state="readonly")
1138
  CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4", "All"], width=60, variable=gpu_choice_var, state="readonly")
1139
  gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50)
 
1140
  lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0)
1141
  mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1)
1142
 
@@ -1185,7 +1209,7 @@ def show_new_gui():
1185
  # Model Tab
1186
  model_tab = tabcontent["Model"]
1187
 
1188
- makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1, filetypes=[("GGML Model Files", "*.gguf;*.bin;*.ggml")])
1189
  makefileentry(model_tab, "Lora:", "Select Lora File",lora_var, 3)
1190
  makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5)
1191
 
@@ -1265,6 +1289,12 @@ def show_new_gui():
1265
  args.noavx2 = True
1266
  args.noblas = True
1267
  args.nommap = True
1268
 
1269
  args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
1270
 
@@ -1329,6 +1359,9 @@ def show_new_gui():
1329
  runopts_var.set(openblas_option)
1330
  if "gpulayers" in dict and dict["gpulayers"]:
1331
  gpulayers_var.set(dict["gpulayers"])
1332
  if "blasthreads" in dict and dict["blasthreads"]:
1333
  blas_threads_var.set(str(dict["blasthreads"]))
1334
  else:
@@ -1447,7 +1480,7 @@ def show_gui_msgbox(title,message):
1447
  def run_horde_worker(args, api_key, worker_name):
1448
  import urllib.request
1449
  from datetime import datetime
1450
- global friendlymodelname, maxhordectx, maxhordelen, exitcounter, modelbusy, session_starttime
1451
  epurl = f"http://localhost:{args.port}"
1452
  if args.host!="":
1453
  epurl = f"http://{args.host}:{args.port}"
@@ -1456,10 +1489,11 @@ def run_horde_worker(args, api_key, worker_name):
1456
  print(f"{datetime.now().strftime('[%H:%M:%S]')} " + txt)
1457
 
1458
  def submit_completed_generation(url, jobid, sessionstart, submit_dict):
1459
- global exitcounter, session_kudos_earned, session_jobs
1460
  reply = make_url_request(url, submit_dict)
1461
  if not reply:
1462
  exitcounter += 1
 
1463
  print_with_time(f"Error, Job submit failed.")
1464
  else:
1465
  reward = reply["reward"]
@@ -1473,6 +1507,11 @@ def run_horde_worker(args, api_key, worker_name):
1473
  elapsedtimestr = f"{hrs:03d}h:{mins:02d}m:{secs:02d}s"
1474
  earnrate = session_kudos_earned/(elapsedtime.seconds/3600)
1475
  print_with_time(f'Submitted {jobid} and earned {reward:.0f} kudos\n[Total:{session_kudos_earned:.0f} kudos, Time:{elapsedtimestr}, Jobs:{session_jobs}, EarnRate:{earnrate:.0f} kudos/hr]')
1476
 
1477
  def make_url_request(url, data, method='POST'):
1478
  try:
@@ -1481,7 +1520,7 @@ def run_horde_worker(args, api_key, worker_name):
1481
  if method=='POST':
1482
  json_payload = json.dumps(data).encode('utf-8')
1483
  request = urllib.request.Request(url, data=json_payload, headers=headers, method=method)
1484
- request.add_header('Content-Type', 'application/json')
1485
  else:
1486
  request = urllib.request.Request(url, headers=headers, method=method)
1487
  response_data = ""
@@ -1508,17 +1547,23 @@ def run_horde_worker(args, api_key, worker_name):
1508
  print(f"===\nEmbedded Horde Worker '{worker_name}' Starting...\n(To use your own KAI Bridge/Scribe worker instead, don't set your API key)")
1509
  BRIDGE_AGENT = f"KoboldCppEmbedWorker:2:https://github.com/LostRuins/koboldcpp"
1510
  cluster = "https://horde.koboldai.net"
1511
- while exitcounter < 10:
1512
  time.sleep(3)
1513
  readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
1514
  if readygo:
1515
  print_with_time(f"Embedded Horde Worker '{worker_name}' is started.")
1516
  break
1517
 
1518
- while exitcounter < 10:
1519
  currentjob_attempts = 0
1520
  current_generation = None
1521
 
 
 
 
 
 
 
1522
  #first, make sure we are not generating
1523
  if modelbusy.locked():
1524
  time.sleep(0.2)
@@ -1537,6 +1582,7 @@ def run_horde_worker(args, api_key, worker_name):
1537
  pop = make_url_request(f'{cluster}/api/v2/generate/text/pop',gen_dict)
1538
  if not pop:
1539
  exitcounter += 1
 
1540
  print_with_time(f"Failed to fetch job from {cluster}. Waiting 5 seconds...")
1541
  time.sleep(5)
1542
  continue
@@ -1555,7 +1601,7 @@ def run_horde_worker(args, api_key, worker_name):
1555
  print_with_time(f"Job received from {cluster} for {current_payload.get('max_length',80)} tokens and {current_payload.get('max_context_length',1024)} max context. Starting generation...")
1556
 
1557
  #do gen
1558
- while exitcounter < 10:
1559
  if not modelbusy.locked():
1560
  current_generation = make_url_request(f'{epurl}/api/v1/generate', current_payload)
1561
  if current_generation:
@@ -1880,4 +1926,10 @@ if __name__ == '__main__':
1880
  parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", action='store_true')
1881
  parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
1882
1883
  main(parser.parse_args(),start_server=True)
 
184
  os.add_dll_directory(dir_path)
185
  os.add_dll_directory(abs_path)
186
  os.add_dll_directory(os.getcwd())
187
+ if libname == lib_hipblas and "HIP_PATH" in os.environ:
188
+ os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin"))
189
+ if args.debugmode == 1:
190
+ print(f"HIP/ROCm SDK at {os.environ['HIP_PATH']} included in .DLL load path")
191
  handle = ctypes.CDLL(os.path.join(dir_path, libname))
192
 
193
  handle.load_model.argtypes = [load_model_inputs]
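The new lines above make the ROCm SDK's DLLs visible before the hipBLAS library is loaded. A hedged, standalone sketch of the same pattern; the library path and environment-variable handling here are illustrative:

```python
# Sketch: expose a vendor SDK's bin/ directory to the Windows DLL loader before
# loading a dependent library with ctypes. Assumes Python 3.8+ on Windows.
import ctypes
import os

def load_backend(libpath, sdk_env="HIP_PATH"):
    sdk_root = os.environ.get(sdk_env)
    if sdk_root:
        os.add_dll_directory(os.path.join(sdk_root, "bin"))   # dependent DLLs resolve from here
    return ctypes.CDLL(libpath)
```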
 
365
  modelbusy = threading.Lock()
366
  requestsinqueue = 0
367
  defaultport = 5001
368
+ KcppVersion = "1.47"
369
  showdebug = True
370
  showsamplerwarning = True
371
  showmaxctxwarning = True
 
373
  session_jobs = 0
374
  session_starttime = None
375
  exitcounter = 0
376
+ punishcounter = 0 #causes a timeout if too many errors
377
+ rewardcounter = 0 #reduces error counts for successful jobs
378
  totalgens = 0
379
  currentusergenkey = "" #store a special key so polled streaming works even in multiuser
380
  args = None #global args
 
418
  elif api_format==4:
419
  # translate openai chat completion messages format into one big string.
420
  messages_array = genparams.get('messages', [])
421
+ adapter_obj = genparams.get('adapter', {})
422
  messages_string = ""
423
+ system_message_start = adapter_obj.get("system_start", "\n### Instruction:\n")
424
+ system_message_end = adapter_obj.get("system_end", "")
425
+ user_message_start = adapter_obj.get("user_start", "\n### Instruction:\n")
426
+ user_message_end = adapter_obj.get("user_end", "")
427
+ assistant_message_start = adapter_obj.get("assistant_start", "\n### Response:\n")
428
+ assistant_message_end = adapter_obj.get("assistant_end", "")
429
+
430
  for message in messages_array:
431
  if message['role'] == "system":
432
+ messages_string += system_message_start
433
  elif message['role'] == "user":
434
+ messages_string += user_message_start
435
  elif message['role'] == "assistant":
436
+ messages_string += assistant_message_start
437
+
438
+ messages_string += message['content']
439
+
440
+ if message['role'] == "system":
441
+ messages_string += system_message_end
442
+ elif message['role'] == "user":
443
+ messages_string += user_message_end
444
+ elif message['role'] == "assistant":
445
+ messages_string += assistant_message_end
446
+
447
+ messages_string += assistant_message_start
448
+
449
  genparams["prompt"] = messages_string
450
  frqp = genparams.get('frequency_penalty', 0.1)
451
  scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
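The rewritten api_format==4 branch above flattens the OpenAI-style messages array into one prompt string, taking the role markers from an optional "adapter" object and falling back to the Alpaca-style defaults. A compact sketch of the same transformation:

```python
# Sketch of the message flattening above; the defaults mirror the ones in the diff.
def flatten_messages(messages, adapter=None):
    a = adapter or {}
    start = {"system":    a.get("system_start", "\n### Instruction:\n"),
             "user":      a.get("user_start", "\n### Instruction:\n"),
             "assistant": a.get("assistant_start", "\n### Response:\n")}
    end   = {"system":    a.get("system_end", ""),
             "user":      a.get("user_end", ""),
             "assistant": a.get("assistant_end", "")}
    prompt = ""
    for m in messages:
        prompt += start[m["role"]] + m["content"] + end[m["role"]]
    return prompt + start["assistant"]      # leave the model positioned to reply

print(flatten_messages([{"role": "user", "content": "Hello"}]))
# "\n### Instruction:\nHello\n### Response:\n"
```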
 
521
  async def handle_sse_stream(self, api_format):
522
  global friendlymodelname
523
  self.send_response(200)
524
+ self.send_header("cache-control", "no-cache")
525
+ self.send_header("connection", "keep-alive")
526
+ self.end_headers(content_type='text/event-stream')
527
 
528
  current_token = 0
529
  incomplete_token_buffer = bytearray()
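handle_sse_stream() now routes its headers through the reworked end_headers() with an explicit text/event-stream content type. A minimal, hedged http.server sketch of the same server-sent-events preamble:

```python
# Sketch of an SSE response preamble matching the headers set above.
from http.server import BaseHTTPRequestHandler, HTTPServer

class SSEHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header("cache-control", "no-cache")
        self.send_header("connection", "keep-alive")
        self.send_header("content-type", "text/event-stream")
        self.end_headers()
        # one event per chunk of generated text: "data: <payload>\n\n"
        self.wfile.write(b'data: {"token": "Hello"}\n\n')
        self.wfile.flush()

# HTTPServer(("localhost", 8080), SSEHandler).serve_forever()   # illustrative only
```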
 
590
  global maxctx, maxhordelen, friendlymodelname, KcppVersion, totalgens
591
  self.path = self.path.rstrip('/')
592
  response_body = None
593
+ content_type = 'application/json'
594
 
595
  if self.path in ["", "/?"] or self.path.startswith(('/?','?')): #it's possible for the root url to have ?params without /
596
+ content_type = 'text/html'
597
  if self.embedded_kailite is None:
598
  response_body = (f"Embedded Kobold Lite is not found.<br>You will have to connect via the main KoboldAI client, or <a href='https://lite.koboldai.net?local=1&port={self.port}'>use this URL</a> to connect.").encode()
599
  else:
 
639
 
640
  elif self.path.endswith('/v1/models'):
641
  response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
 
642
 
643
  elif self.path=="/api":
644
+ content_type = 'text/html'
645
  if self.embedded_kcpp_docs is None:
646
  response_body = (f"KoboldCpp partial API reference can be found at the wiki: https://github.com/LostRuins/koboldcpp/wiki").encode()
647
  else:
 
649
  elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
650
  self.path = "/api"
651
  self.send_response(302)
652
+ self.send_header("location", self.path)
653
+ self.end_headers(content_type='text/html')
654
  return None
655
 
656
  if response_body is None:
657
  self.send_response(404)
658
+ self.end_headers(content_type='text/html')
659
  rp = 'Error: HTTP Server is running, but this endpoint does not exist. Please check the URL.'
660
  self.wfile.write(rp.encode())
661
  else:
662
  self.send_response(200)
663
+ self.send_header('content-length', str(len(response_body)))
664
+ self.end_headers(content_type=content_type)
665
  self.wfile.write(response_body)
666
  return
667
 
668
  def do_POST(self):
669
  global modelbusy, requestsinqueue, currentusergenkey, totalgens
670
+ content_length = int(self.headers['content-length'])
671
  body = self.rfile.read(content_length)
672
  self.path = self.path.rstrip('/')
 
673
  if self.path.endswith(('/api/extra/tokencount')):
674
  try:
675
  genparams = json.loads(body)
676
  countprompt = genparams.get('prompt', "")
677
  count = handle.token_count(countprompt.encode("UTF-8"))
678
  self.send_response(200)
679
+ self.end_headers(content_type='application/json')
680
  self.wfile.write(json.dumps({"value": count}).encode())
681
 
682
  except ValueError as e:
683
  utfprint("Count Tokens - Body Error: " + str(e))
684
  self.send_response(400)
685
+ self.end_headers(content_type='application/json')
686
  self.wfile.write(json.dumps({"value": -1}).encode())
687
  return
688
 
 
695
  multiuserkey = ""
696
  pass
697
 
698
+ if (multiuserkey=="" and requestsinqueue==0) or (multiuserkey!="" and multiuserkey==currentusergenkey):
699
  ag = handle.abort_generate()
700
  time.sleep(0.3) #short delay before replying
701
  self.send_response(200)
702
+ self.end_headers(content_type='application/json')
703
  self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
704
  print("\nGeneration Aborted")
705
  else:
 
717
  pass
718
 
719
  if totalgens>0:
720
+ if (multiuserkey=="" and requestsinqueue==0) or (multiuserkey!="" and multiuserkey==currentusergenkey):
721
  pendtxt = handle.get_pending_output()
722
  pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore")
723
  self.send_response(200)
724
+ self.end_headers(content_type='application/json')
725
  self.wfile.write(json.dumps({"results": [{"text": pendtxtStr}]}).encode())
726
  return
727
 
 
731
  requestsinqueue += 1
732
  if not modelbusy.acquire(blocking=reqblocking):
733
  self.send_response(503)
734
+ self.end_headers(content_type='application/json')
735
  self.wfile.write(json.dumps({"detail": {
736
  "msg": "Server is busy; please try again later.",
737
  "type": "service_unavailable",
 
757
 
758
  if self.path.endswith('/v1/completions'):
759
  api_format = 3
 
760
 
761
  if self.path.endswith('/v1/chat/completions'):
762
  api_format = 4
 
763
 
764
  if api_format > 0:
765
  genparams = None
 
785
  # Headers are already sent when streaming
786
  if not sse_stream_flag:
787
  self.send_response(200)
788
+ self.end_headers(content_type='application/json')
789
+ self.wfile.write(json.dumps(gen).encode())
790
  except:
791
  print("Generate: The response could not be sent, maybe connection was terminated?")
792
  return
 
794
  modelbusy.release()
795
 
796
  self.send_response(404)
797
+ self.end_headers(content_type='text/html')
798
 
799
 
800
  def do_OPTIONS(self):
801
  self.send_response(200)
802
+ self.end_headers(content_type='text/html')
803
 
804
  def do_HEAD(self):
805
  self.send_response(200)
806
+ self.end_headers(content_type='text/html')
807
+
808
+ def end_headers(self, content_type=None):
809
+ self.send_header('access-control-allow-origin', '*')
810
+ self.send_header('access-control-allow-methods', '*')
811
+ self.send_header('access-control-allow-headers', '*, Accept, Content-Type, Content-Length, Accept-Encoding, X-CSRF-Token, Client-Agent, X-Fields, Content-Type, Authorization, X-Requested-With, X-HTTP-Method-Override, apikey, genkey')
812
+ if content_type is not None:
813
+ self.send_header('content-type', content_type)
814
  return super(ServerRequestHandler, self).end_headers()
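With the force_json flag gone, end_headers() above now always emits wildcard CORS headers and takes the content type as an explicit argument. A hedged illustration of what a preflight request against a local instance would see, assuming koboldcpp is running on the default port:

```python
# Illustration only: do_OPTIONS answers preflights with 200 and the permissive
# access-control-* headers added in end_headers().
import urllib.request

req = urllib.request.Request("http://localhost:5001/api/v1/info/version", method="OPTIONS")
with urllib.request.urlopen(req) as resp:
    print(resp.status)                                         # 200
    print(resp.headers.get("access-control-allow-origin"))     # *
```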
815
 
816
 
 
1034
  mmq_var = ctk.IntVar(value=1)
1035
  blas_threads_var = ctk.StringVar()
1036
  blas_size_var = ctk.IntVar()
1037
+ version_var = ctk.StringVar(value="0")
1038
+ tensor_split_str_vars = ctk.StringVar(value="")
1039
 
1040
  smartcontext = ctk.IntVar()
1041
  context_var = ctk.IntVar()
 
1087
  quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
1088
  mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
1089
  quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
1090
+ tensor_split_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
1091
+ tensor_split_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
1092
  else:
1093
  lowvram_box.grid_forget()
1094
  quick_lowvram_box.grid_forget()
1095
  mmq_box.grid_forget()
1096
  quick_mmq_box.grid_forget()
1097
+ tensor_split_label.grid_forget()
1098
+ tensor_split_entry.grid_forget()
1099
 
1100
  if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
1101
  gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
 
1108
  quick_gpu_layers_label.grid_forget()
1109
  quick_gpu_layers_entry.grid_forget()
1110
 
1111
+
1112
  # presets selector
1113
  makelabel(quick_tab, "Presets:", 1)
1114
 
 
1141
  makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2)
1142
 
1143
  # load model
1144
+ makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170)
1145
 
1146
  # Hardware Tab
1147
  hardware_tab = tabcontent["Hardware"]
 
1160
  gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4"], width=60, variable=gpu_choice_var, state="readonly")
1161
  CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4", "All"], width=60, variable=gpu_choice_var, state="readonly")
1162
  gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50)
1163
+ tensor_split_entry,tensor_split_label = makelabelentry(hardware_tab, "Tensor Split:", tensor_split_str_vars, 6, 80)
1164
  lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0)
1165
  mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1)
1166
 
 
1209
  # Model Tab
1210
  model_tab = tabcontent["Model"]
1211
 
1212
+ makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1)
1213
  makefileentry(model_tab, "Lora:", "Select Lora File",lora_var, 3)
1214
  makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5)
1215
 
 
1289
  args.noavx2 = True
1290
  args.noblas = True
1291
  args.nommap = True
1292
+ if tensor_split_str_vars.get()!="":
1293
+ tssv = tensor_split_str_vars.get()
1294
+ if "," in tssv:
1295
+ args.tensor_split = [float(x) for x in tssv.split(",")]
1296
+ else:
1297
+ args.tensor_split = [float(x) for x in tssv.split(" ")]
1298
 
1299
  args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
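The GUI now forwards a "Tensor Split" entry as args.tensor_split, accepting either comma- or space-separated ratios, as parsed above. A hedged sketch of an equivalent parser that also tolerates mixed separators:

```python
# Sketch: equivalent to the branch above, but tolerant of mixed "," and space
# separators. The values are per-GPU proportions passed through to the backend.
def parse_tensor_split(text):
    parts = text.replace(",", " ").split()
    return [float(p) for p in parts] if parts else None

print(parse_tensor_split("60,40"))    # [60.0, 40.0]
print(parse_tensor_split("1 1 2"))    # [1.0, 1.0, 2.0]
```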
1300
 
 
1359
  runopts_var.set(openblas_option)
1360
  if "gpulayers" in dict and dict["gpulayers"]:
1361
  gpulayers_var.set(dict["gpulayers"])
1362
+ if "tensor_split" in dict and dict["tensor_split"]:
1363
+ tssep = ','.join(map(str, dict["tensor_split"]))
1364
+ tensor_split_str_vars.set(tssep)
1365
  if "blasthreads" in dict and dict["blasthreads"]:
1366
  blas_threads_var.set(str(dict["blasthreads"]))
1367
  else:
 
1480
  def run_horde_worker(args, api_key, worker_name):
1481
  import urllib.request
1482
  from datetime import datetime
1483
+ global friendlymodelname, maxhordectx, maxhordelen, exitcounter, punishcounter, modelbusy, session_starttime
1484
  epurl = f"http://localhost:{args.port}"
1485
  if args.host!="":
1486
  epurl = f"http://{args.host}:{args.port}"
 
1489
  print(f"{datetime.now().strftime('[%H:%M:%S]')} " + txt)
1490
 
1491
  def submit_completed_generation(url, jobid, sessionstart, submit_dict):
1492
+ global exitcounter, punishcounter, session_kudos_earned, session_jobs, rewardcounter
1493
  reply = make_url_request(url, submit_dict)
1494
  if not reply:
1495
  exitcounter += 1
1496
+ punishcounter += 1
1497
  print_with_time(f"Error, Job submit failed.")
1498
  else:
1499
  reward = reply["reward"]
 
1507
  elapsedtimestr = f"{hrs:03d}h:{mins:02d}m:{secs:02d}s"
1508
  earnrate = session_kudos_earned/(elapsedtime.seconds/3600)
1509
  print_with_time(f'Submitted {jobid} and earned {reward:.0f} kudos\n[Total:{session_kudos_earned:.0f} kudos, Time:{elapsedtimestr}, Jobs:{session_jobs}, EarnRate:{earnrate:.0f} kudos/hr]')
1510
+ rewardcounter += 1
1511
+ if rewardcounter > 50:
1512
+ rewardcounter = 0
1513
+ if exitcounter > 5:
1514
+ exitcounter -= 1
1515
 
1516
  def make_url_request(url, data, method='POST'):
1517
  try:
 
1520
  if method=='POST':
1521
  json_payload = json.dumps(data).encode('utf-8')
1522
  request = urllib.request.Request(url, data=json_payload, headers=headers, method=method)
1523
+ request.add_header('content-type', 'application/json')
1524
  else:
1525
  request = urllib.request.Request(url, headers=headers, method=method)
1526
  response_data = ""
 
1547
  print(f"===\nEmbedded Horde Worker '{worker_name}' Starting...\n(To use your own KAI Bridge/Scribe worker instead, don't set your API key)")
1548
  BRIDGE_AGENT = f"KoboldCppEmbedWorker:2:https://github.com/LostRuins/koboldcpp"
1549
  cluster = "https://horde.koboldai.net"
1550
+ while exitcounter < 35:
1551
  time.sleep(3)
1552
  readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
1553
  if readygo:
1554
  print_with_time(f"Embedded Horde Worker '{worker_name}' is started.")
1555
  break
1556
 
1557
+ while exitcounter < 35:
1558
  currentjob_attempts = 0
1559
  current_generation = None
1560
 
1561
+ if punishcounter >= 10:
1562
+ punishcounter = 0
1563
+ print_with_time(f"Horde Worker Paused for 10 min - Too many errors. It will resume automatically.")
1564
+ print_with_time(f"Caution: Too many failed jobs may lead to entering maintenance mode.")
1565
+ time.sleep(600)
1566
+
1567
  #first, make sure we are not generating
1568
  if modelbusy.locked():
1569
  time.sleep(0.2)
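The embedded horde worker now tracks punishcounter and rewardcounter in addition to exitcounter: failures raise both, roughly every 50 successful submissions forgive one accumulated error, and ten recent failures force a 10-minute pause instead of exiting. A hedged sketch of that backoff bookkeeping in isolation:

```python
# Sketch of the error backoff used above; thresholds mirror the diff.
import time

exitcounter = punishcounter = rewardcounter = 0

def on_failure():
    global exitcounter, punishcounter
    exitcounter += 1
    punishcounter += 1

def on_success():
    global exitcounter, rewardcounter
    rewardcounter += 1
    if rewardcounter > 50:          # pay one error back after a run of good jobs
        rewardcounter = 0
        if exitcounter > 5:
            exitcounter -= 1

def maybe_pause():
    global punishcounter
    if punishcounter >= 10:         # too many recent errors: pause, don't quit
        punishcounter = 0
        time.sleep(600)
```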
 
1582
  pop = make_url_request(f'{cluster}/api/v2/generate/text/pop',gen_dict)
1583
  if not pop:
1584
  exitcounter += 1
1585
+ punishcounter += 1
1586
  print_with_time(f"Failed to fetch job from {cluster}. Waiting 5 seconds...")
1587
  time.sleep(5)
1588
  continue
 
1601
  print_with_time(f"Job received from {cluster} for {current_payload.get('max_length',80)} tokens and {current_payload.get('max_context_length',1024)} max context. Starting generation...")
1602
 
1603
  #do gen
1604
+ while exitcounter < 35:
1605
  if not modelbusy.locked():
1606
  current_generation = make_url_request(f'{epurl}/api/v1/generate', current_payload)
1607
  if current_generation:
 
1926
  parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", action='store_true')
1927
  parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
1928
 
1929
+ # #deprecated hidden args. they do nothing. do not use
1930
+ # parser.add_argument("--psutil_set_threads", action='store_true', help=argparse.SUPPRESS)
1931
+ # parser.add_argument("--stream", action='store_true', help=argparse.SUPPRESS)
1932
+ # parser.add_argument("--unbantokens", action='store_true', help=argparse.SUPPRESS)
1933
+ # parser.add_argument("--usemirostat", action='store_true', help=argparse.SUPPRESS)
1934
+
1935
  main(parser.parse_args(),start_server=True)
llama.cpp CHANGED
@@ -189,6 +189,7 @@ enum llm_arch {
189
  LLM_ARCH_STARCODER,
190
  LLM_ARCH_PERSIMMON,
191
  LLM_ARCH_REFACT,
 
192
  LLM_ARCH_UNKNOWN,
193
  };
194
 
@@ -202,7 +203,8 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
202
  { LLM_ARCH_BAICHUAN, "baichuan" },
203
  { LLM_ARCH_STARCODER, "starcoder" },
204
  { LLM_ARCH_PERSIMMON, "persimmon" },
205
- { LLM_ARCH_REFACT, "refact" },
 
206
  };
207
 
208
  enum llm_kv {
@@ -305,6 +307,7 @@ struct LLM_KV {
305
 
306
  enum llm_tensor {
307
  LLM_TENSOR_TOKEN_EMBD,
 
308
  LLM_TENSOR_POS_EMBD,
309
  LLM_TENSOR_OUTPUT,
310
  LLM_TENSOR_OUTPUT_NORM,
@@ -425,6 +428,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
425
  LLM_ARCH_MPT,
426
  {
427
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
428
  },
429
  },
430
  {
@@ -459,6 +470,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
459
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
462
  {
463
  LLM_ARCH_UNKNOWN,
464
  {
@@ -1016,6 +1042,9 @@ struct llama_hparams {
1016
  float rope_freq_base_train;
1017
  float rope_freq_scale_train;
1018
 
 
 
 
1019
  bool operator!=(const llama_hparams & other) const {
1020
  if (this->vocab_only != other.vocab_only) return true;
1021
  if (this->n_vocab != other.n_vocab) return true;
@@ -1201,6 +1230,8 @@ struct llama_model {
1201
 
1202
  struct ggml_tensor * tok_embeddings;
1203
  struct ggml_tensor * pos_embeddings;
1204
 
1205
  struct ggml_tensor * output_norm;
1206
  struct ggml_tensor * output_norm_b;
@@ -1330,7 +1361,11 @@ static bool llama_kv_cache_init(
1330
  cache.cells.clear();
1331
  cache.cells.resize(n_ctx);
1332
 
 
 
 
1333
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
 
1334
 
1335
  struct ggml_init_params params;
1336
  params.mem_size = cache.buf.size;
@@ -1736,7 +1771,7 @@ struct llama_model_loader {
1736
  }
1737
  }
1738
 
1739
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
1740
  if (backend != GGML_BACKEND_CPU) {
1741
  ggml_set_no_alloc(ctx, true);
1742
  }
@@ -1754,7 +1789,7 @@ struct llama_model_loader {
1754
  return tensor;
1755
  }
1756
 
1757
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
1758
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1759
 
1760
  if (cur == NULL) {
@@ -2047,13 +2082,13 @@ static void llm_load_hparams(
2047
  }
2048
  } break;
2049
  case LLM_ARCH_PERSIMMON:
2050
- {
2051
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2052
- switch (hparams.n_layer) {
2053
- case 36: model.type = e_model::MODEL_8B; break;
2054
- default: model.type = e_model::MODEL_UNKNOWN;
2055
- }
2056
- } break;
2057
  case LLM_ARCH_REFACT:
2058
  {
2059
  GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -2062,6 +2097,33 @@ static void llm_load_hparams(
2062
  default: model.type = e_model::MODEL_UNKNOWN;
2063
  }
2064
  } break;
2065
  default: (void)0;
2066
  }
2067
 
@@ -2206,6 +2268,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
2206
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2207
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2208
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
 
 
2209
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2210
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2211
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2305,8 +2369,8 @@ static void llm_load_tensors(
2305
 
2306
  // output
2307
  {
2308
- ggml_backend backend_norm;
2309
- ggml_backend backend_output;
2310
 
2311
  if (n_gpu_layers > int(n_layer)) {
2312
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2341,8 +2405,8 @@ static void llm_load_tensors(
2341
  model.layers.resize(n_layer);
2342
 
2343
  for (uint32_t i = 0; i < n_layer; ++i) {
2344
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2345
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2346
 
2347
  auto & layer = model.layers[i];
2348
 
@@ -2371,8 +2435,8 @@ static void llm_load_tensors(
2371
  {
2372
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2373
  {
2374
- ggml_backend backend_norm;
2375
- ggml_backend backend_output;
2376
 
2377
  if (n_gpu_layers > int(n_layer)) {
2378
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2407,8 +2471,8 @@ static void llm_load_tensors(
2407
  model.layers.resize(n_layer);
2408
 
2409
  for (uint32_t i = 0; i < n_layer; ++i) {
2410
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2411
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2412
 
2413
  auto & layer = model.layers[i];
2414
 
@@ -2441,8 +2505,8 @@ static void llm_load_tensors(
2441
 
2442
  // output
2443
  {
2444
- ggml_backend backend_norm;
2445
- ggml_backend backend_output;
2446
 
2447
  if (n_gpu_layers > int(n_layer)) {
2448
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2479,8 +2543,8 @@ static void llm_load_tensors(
2479
  model.layers.resize(n_layer);
2480
 
2481
  for (uint32_t i = 0; i < n_layer; ++i) {
2482
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2483
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2484
 
2485
  auto & layer = model.layers[i];
2486
 
@@ -2518,8 +2582,8 @@ static void llm_load_tensors(
2518
 
2519
  // output
2520
  {
2521
- ggml_backend backend_norm;
2522
- ggml_backend backend_output;
2523
 
2524
  if (n_gpu_layers > int(n_layer)) {
2525
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2556,8 +2620,8 @@ static void llm_load_tensors(
2556
  model.layers.resize(n_layer);
2557
 
2558
  for (uint32_t i = 0; i < n_layer; ++i) {
2559
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2560
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2561
 
2562
  auto & layer = model.layers[i];
2563
 
@@ -2595,8 +2659,8 @@ static void llm_load_tensors(
2595
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2596
 
2597
  {
2598
- ggml_backend backend_norm;
2599
- ggml_backend backend_output;
2600
 
2601
  if (n_gpu_layers > int(n_layer)) {
2602
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2630,8 +2694,8 @@ static void llm_load_tensors(
2630
  const int i_gpu_start = n_layer - n_gpu_layers;
2631
  model.layers.resize(n_layer);
2632
  for (uint32_t i = 0; i < n_layer; ++i) {
2633
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2634
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2635
  auto & layer = model.layers[i];
2636
  layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2637
  layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2651,6 +2715,155 @@ static void llm_load_tensors(
2651
  layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
2652
  }
2653
  } break;
2654
  default:
2655
  throw std::runtime_error("unknown architecture");
2656
  }
@@ -4507,7 +4720,6 @@ static struct ggml_cgraph * llm_build_starcoder(
4507
  return gf;
4508
  }
4509
 
4510
-
4511
  static struct ggml_cgraph * llm_build_persimmon(
4512
  llama_context & lctx,
4513
  const llama_batch & batch) {
@@ -4905,37 +5117,604 @@ static struct ggml_cgraph * llm_build_persimmon(
4905
  return gf;
4906
  }
4907
 
4908
- static struct ggml_cgraph * llama_build_graph(
4909
  llama_context & lctx,
4910
  const llama_batch & batch) {
4911
- const auto & model = lctx.model;
 
 
4912
 
4913
- struct ggml_cgraph * result = NULL;
4914
 
4915
- switch (model.arch) {
4916
- case LLM_ARCH_LLAMA:
4917
- {
4918
- result = llm_build_llama(lctx, batch);
4919
- } break;
4920
- case LLM_ARCH_BAICHUAN:
4921
- {
4922
- result = llm_build_baichaun(lctx, batch);
4923
- } break;
4924
- case LLM_ARCH_FALCON:
4925
- {
4926
- result = llm_build_falcon(lctx, batch);
4927
- } break;
4928
- case LLM_ARCH_STARCODER:
4929
- {
4930
- result = llm_build_starcoder(lctx, batch);
4931
- } break;
4932
- case LLM_ARCH_PERSIMMON:
4933
- {
4934
- result = llm_build_persimmon(lctx, batch);
4935
- } break;
4936
- case LLM_ARCH_REFACT:
4937
- {
4938
- result = llm_build_refact(lctx, batch);
 
4939
  } break;
4940
  default:
4941
  GGML_ASSERT(false);
@@ -5067,7 +5846,8 @@ static int llama_decode_internal(
5067
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
5068
  model.arch == LLM_ARCH_BAICHUAN ||
5069
  model.arch == LLM_ARCH_FALCON ||
5070
- model.arch == LLM_ARCH_REFACT;
 
5071
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
5072
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
5073
  n_threads = 1;
@@ -5568,7 +6348,6 @@ private:
5568
  for (int i = 0; i < (int)text_utf.size(); i++) {
5569
  const std::string & utf_char = text_utf[i];
5570
  bool split_condition = false;
5571
- // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
5572
  int bytes_remain = text_utf.size() - i;
5573
  // forward backward lookups
5574
  const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -5594,9 +6373,9 @@ private:
5594
  if (!split_condition && bytes_remain >= 3) {
5595
  // 're|'ve|'ll
5596
  if (utf_char == "\'" && (
5597
- (utf_char_next == "r" || utf_char_next_next == "e") ||
5598
- (utf_char_next == "v" || utf_char_next_next == "e") ||
5599
- (utf_char_next == "l" || utf_char_next_next == "l"))
5600
  ) {
5601
  split_condition = true;
5602
  }
@@ -5647,7 +6426,7 @@ private:
5647
  else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
5648
  split_condition = true;
5649
  }
5650
- else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
5651
  split_condition = true;
5652
  }
5653
  }
@@ -7166,7 +7945,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
7166
  const std::string name = ggml_get_name(meta);
7167
 
7168
  // TODO: avoid hardcoded tensor names - use the TN_* constants
7169
- if (name.find("attn_v.weight") != std::string::npos) {
7170
  ++n_attention_wv;
7171
  }
7172
  else if (name.find("ffn_down.weight") != std::string::npos) {
 
189
  LLM_ARCH_STARCODER,
190
  LLM_ARCH_PERSIMMON,
191
  LLM_ARCH_REFACT,
192
+ LLM_ARCH_BLOOM,
193
  LLM_ARCH_UNKNOWN,
194
  };
195
 
 
203
  { LLM_ARCH_BAICHUAN, "baichuan" },
204
  { LLM_ARCH_STARCODER, "starcoder" },
205
  { LLM_ARCH_PERSIMMON, "persimmon" },
206
+ { LLM_ARCH_REFACT, "refact" },
207
+ { LLM_ARCH_BLOOM, "bloom" },
208
  };
209
 
210
  enum llm_kv {
 
307
 
308
  enum llm_tensor {
309
  LLM_TENSOR_TOKEN_EMBD,
310
+ LLM_TENSOR_TOKEN_EMBD_NORM,
311
  LLM_TENSOR_POS_EMBD,
312
  LLM_TENSOR_OUTPUT,
313
  LLM_TENSOR_OUTPUT_NORM,
 
428
  LLM_ARCH_MPT,
429
  {
430
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
431
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
432
+ { LLM_TENSOR_OUTPUT, "output" },
433
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
434
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
435
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
436
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
437
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
438
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
439
  },
440
  },
441
  {
 
470
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
471
  },
472
  },
473
+ {
474
+ LLM_ARCH_BLOOM,
475
+ {
476
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
477
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
478
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
479
+ { LLM_TENSOR_OUTPUT, "output" },
480
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
481
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
482
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
483
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
484
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
485
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
486
+ },
487
+ },
488
  {
489
  LLM_ARCH_UNKNOWN,
490
  {
 
1042
  float rope_freq_base_train;
1043
  float rope_freq_scale_train;
1044
 
1045
+ float f_clamp_kqv;
1046
+ float f_max_alibi_bias;
1047
+
1048
  bool operator!=(const llama_hparams & other) const {
1049
  if (this->vocab_only != other.vocab_only) return true;
1050
  if (this->n_vocab != other.n_vocab) return true;
 
1230
 
1231
  struct ggml_tensor * tok_embeddings;
1232
  struct ggml_tensor * pos_embeddings;
1233
+ struct ggml_tensor * tok_norm;
1234
+ struct ggml_tensor * tok_norm_b;
1235
 
1236
  struct ggml_tensor * output_norm;
1237
  struct ggml_tensor * output_norm_b;
 
1361
  cache.cells.clear();
1362
  cache.cells.resize(n_ctx);
1363
 
1364
+ // TODO: this should be:
1365
+ // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
1366
+ // change it and test that it works
1367
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
1368
+ memset(cache.buf.data, 0, cache.buf.size);
1369
 
1370
  struct ggml_init_params params;
1371
  params.mem_size = cache.buf.size;
 
1771
  }
1772
  }
1773
 
1774
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
1775
  if (backend != GGML_BACKEND_CPU) {
1776
  ggml_set_no_alloc(ctx, true);
1777
  }
 
1789
  return tensor;
1790
  }
1791
 
1792
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
1793
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1794
 
1795
  if (cur == NULL) {
 
2082
  }
2083
  } break;
2084
  case LLM_ARCH_PERSIMMON:
2085
+ {
2086
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2087
+ switch (hparams.n_layer) {
2088
+ case 36: model.type = e_model::MODEL_8B; break;
2089
+ default: model.type = e_model::MODEL_UNKNOWN;
2090
+ }
2091
+ } break;
2092
  case LLM_ARCH_REFACT:
2093
  {
2094
  GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
 
2097
  default: model.type = e_model::MODEL_UNKNOWN;
2098
  }
2099
  } break;
2100
+ case LLM_ARCH_BLOOM:
2101
+ {
2102
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2103
+
2104
+ switch (hparams.n_layer) {
2105
+ case 24: model.type = e_model::MODEL_1B; break;
2106
+ case 30:
2107
+ switch (hparams.n_embd) {
2108
+ case 2560: model.type = e_model::MODEL_3B; break;
2109
+ case 4096: model.type = e_model::MODEL_7B; break;
2110
+ } break;
2111
+ }
2112
+ } break;
2113
+ case LLM_ARCH_MPT:
2114
+ {
2115
+ hparams.f_clamp_kqv = 0.0f;
2116
+
2117
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2118
+ GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
2119
+ GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
2120
+
2121
+ switch (hparams.n_layer) {
2122
+ case 32: model.type = e_model::MODEL_7B; break;
2123
+ case 48: model.type = e_model::MODEL_30B; break;
2124
+ default: model.type = e_model::MODEL_UNKNOWN;
2125
+ }
2126
+ } break;
2127
  default: (void)0;
2128
  }
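llm_load_hparams() above now recognises BLOOM and MPT: BLOOM picks its size from n_layer (and, for the 30-layer checkpoints, from n_embd), while MPT additionally reads the clamp_kqv and max_alibi_bias keys. For quick reference, a hedged Python rendering of the BLOOM size lookup:

```python
# Python-only rendering of the BLOOM size detection in the C++ above.
def bloom_model_type(n_layer, n_embd):
    if n_layer == 24:
        return "1B"
    if n_layer == 30:
        return {2560: "3B", 4096: "7B"}.get(n_embd, "UNKNOWN")
    return "UNKNOWN"

print(bloom_model_type(30, 4096))   # 7B
```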
2129
 
 
2268
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2269
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2270
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
2271
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
2272
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
2273
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2274
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2275
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
 
2369
 
2370
  // output
2371
  {
2372
+ ggml_backend_type backend_norm;
2373
+ ggml_backend_type backend_output;
2374
 
2375
  if (n_gpu_layers > int(n_layer)) {
2376
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2405
  model.layers.resize(n_layer);
2406
 
2407
  for (uint32_t i = 0; i < n_layer; ++i) {
2408
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2409
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2410
 
2411
  auto & layer = model.layers[i];
2412
 
 
2435
  {
2436
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2437
  {
2438
+ ggml_backend_type backend_norm;
2439
+ ggml_backend_type backend_output;
2440
 
2441
  if (n_gpu_layers > int(n_layer)) {
2442
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2471
  model.layers.resize(n_layer);
2472
 
2473
  for (uint32_t i = 0; i < n_layer; ++i) {
2474
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2475
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2476
 
2477
  auto & layer = model.layers[i];
2478
 
 
2505
 
2506
  // output
2507
  {
2508
+ ggml_backend_type backend_norm;
2509
+ ggml_backend_type backend_output;
2510
 
2511
  if (n_gpu_layers > int(n_layer)) {
2512
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2543
  model.layers.resize(n_layer);
2544
 
2545
  for (uint32_t i = 0; i < n_layer; ++i) {
2546
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2547
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2548
 
2549
  auto & layer = model.layers[i];
2550
 
 
2582
 
2583
  // output
2584
  {
2585
+ ggml_backend_type backend_norm;
2586
+ ggml_backend_type backend_output;
2587
 
2588
  if (n_gpu_layers > int(n_layer)) {
2589
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2620
  model.layers.resize(n_layer);
2621
 
2622
  for (uint32_t i = 0; i < n_layer; ++i) {
2623
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2624
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2625
 
2626
  auto & layer = model.layers[i];
2627
 
 
2659
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2660
 
2661
  {
2662
+ ggml_backend_type backend_norm;
2663
+ ggml_backend_type backend_output;
2664
 
2665
  if (n_gpu_layers > int(n_layer)) {
2666
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2694
  const int i_gpu_start = n_layer - n_gpu_layers;
2695
  model.layers.resize(n_layer);
2696
  for (uint32_t i = 0; i < n_layer; ++i) {
2697
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2698
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2699
  auto & layer = model.layers[i];
2700
  layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2701
  layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
 
2715
  layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
2716
  }
2717
  } break;
2718
+ case LLM_ARCH_BLOOM:
2719
+ {
2720
+ // TODO: CPU-only for now
2721
+
2722
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2723
+ model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
2724
+ model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
2725
+
2726
+ // output
2727
+ {
2728
+ ggml_backend_type backend_norm;
2729
+ ggml_backend_type backend_output;
2730
+
2731
+ if (n_gpu_layers > int(n_layer)) {
2732
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2733
+ // on Windows however this is detrimental unless everything is on the GPU
2734
+ #ifndef _WIN32
2735
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2736
+ #else
2737
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2738
+ #endif // _WIN32
2739
+
2740
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2741
+ } else {
2742
+ backend_norm = GGML_BACKEND_CPU;
2743
+ backend_output = GGML_BACKEND_CPU;
2744
+ }
2745
+
2746
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2747
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2748
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2749
+
2750
+ if (backend_norm == GGML_BACKEND_GPU) {
2751
+ vram_weights += ggml_nbytes(model.output_norm);
2752
+ vram_weights += ggml_nbytes(model.output_norm_b);
2753
+ }
2754
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2755
+ vram_weights += ggml_nbytes(model.output);
2756
+ }
2757
+ }
2758
+
2759
+ const uint32_t n_ff = hparams.n_ff;
2760
+
2761
+ const int i_gpu_start = n_layer - n_gpu_layers;
2762
+
2763
+ model.layers.resize(n_layer);
2764
+
2765
+ for (uint32_t i = 0; i < n_layer; ++i) {
2766
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2767
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2768
+
2769
+ auto & layer = model.layers[i];
2770
+
2771
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2772
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2773
+
2774
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2775
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2776
+
2777
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2778
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2779
+
2780
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2781
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2782
+
2783
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2784
+ layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2785
+
2786
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2787
+ layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2788
+
2789
+ if (backend == GGML_BACKEND_GPU) {
2790
+ vram_weights +=
2791
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
2792
+ ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
2793
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
2794
+ ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
2795
+ ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
2796
+ ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
2797
+ }
2798
+ }
2799
+ } break;
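vram_weights in the block above is purely bookkeeping: whenever a tensor lands on GGML_BACKEND_GPU (or GGML_BACKEND_GPU_SPLIT, for the output matrix) its ggml_nbytes() is added so the loader can report how much VRAM the offloaded weights will need. A rough sketch of that accounting for one BLOOM-style layer, assuming f16 storage and made-up dimensions; the real code sums ggml_nbytes() of the actual, usually quantized, tensors:

#include <cstddef>
#include <cstdio>

// Illustrative VRAM accounting for one offloaded BLOOM-style layer stored in f16
// (2 bytes per element). Dimensions are assumptions, not read from a model.
static size_t f16_bytes(size_t n) { return n * 2; }

int main() {
    const size_t n_embd = 4096, n_ff = 4 * n_embd;
    size_t per_layer = 0;
    per_layer += f16_bytes(n_embd) * 2;               // attn_norm + attn_norm_b
    per_layer += f16_bytes(n_embd * 3 * n_embd);      // wqkv (no GQA, so n_embd_gqa == n_embd)
    per_layer += f16_bytes(3 * n_embd);               // bqkv
    per_layer += f16_bytes(n_embd * n_embd);          // wo
    per_layer += f16_bytes(n_embd);                   // bo
    per_layer += f16_bytes(n_embd) * 2;               // ffn_norm + ffn_norm_b
    per_layer += f16_bytes(n_ff * n_embd) * 2;        // w2 (down) + w3 (up)
    per_layer += f16_bytes(n_embd) + f16_bytes(n_ff); // b2 + b3
    printf("%.1f MiB per offloaded layer\n", per_layer / (1024.0 * 1024.0));
    return 0;
}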
2800
+ case LLM_ARCH_MPT:
2801
+ {
2802
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2803
+
2804
+ // output
2805
+ {
2806
+ ggml_backend_type backend_norm;
2807
+ ggml_backend_type backend_output;
2808
+
2809
+ if (n_gpu_layers > int(n_layer)) {
2810
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2811
+ // on Windows however this is detrimental unless everything is on the GPU
2812
+ #ifndef _WIN32
2813
+ backend_norm = LLAMA_BACKEND_OFFLOAD;
2814
+ #else
2815
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2816
+ #endif // _WIN32
2817
+
2818
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2819
+ } else {
2820
+ backend_norm = GGML_BACKEND_CPU;
2821
+ backend_output = GGML_BACKEND_CPU;
2822
+ }
2823
+
2824
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2825
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2826
+
2827
+ if (backend_norm == GGML_BACKEND_GPU) {
2828
+ vram_weights += ggml_nbytes(model.output_norm);
2829
+ }
2830
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2831
+ vram_weights += ggml_nbytes(model.output);
2832
+ }
2833
+ }
2834
+
2835
+ const uint32_t n_ff = hparams.n_ff;
2836
+
2837
+ const int i_gpu_start = n_layer - n_gpu_layers;
2838
+
2839
+ model.layers.resize(n_layer);
2840
+
2841
+ for (uint32_t i = 0; i < n_layer; ++i) {
2842
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2843
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2844
+
2845
+ auto & layer = model.layers[i];
2846
+
2847
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2848
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
2849
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2850
+
2851
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2852
+
2853
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2854
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2855
+
2856
+ if (backend == GGML_BACKEND_GPU) {
2857
+ vram_weights +=
2858
+ ggml_nbytes(layer.attn_norm) +
2859
+ ggml_nbytes(layer.wqkv) +
2860
+ ggml_nbytes(layer.wo) +
2861
+ ggml_nbytes(layer.ffn_norm) +
2862
+ ggml_nbytes(layer.w2) +
2863
+ ggml_nbytes(layer.w3);
2864
+ }
2865
+ }
2866
+ } break;
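When enough layers are offloaded, both new architectures place the output matrix on LLAMA_BACKEND_OFFLOAD_SPLIT, i.e. GGML_BACKEND_GPU_SPLIT, which the CUDA backend interprets as: distribute the matrix across the available GPUs row-wise according to the --tensor-split proportions instead of copying it to a single device. A rough sketch of that row split under an assumed 60/40 ratio; the real code also aligns the boundaries to quantization block sizes:

#include <cstdio>

// Illustrative row-wise split of a GGML_BACKEND_GPU_SPLIT matrix across two GPUs.
// The 60/40 proportions are an example, not taken from any real configuration.
int main() {
    const int   nrows           = 32000;          // e.g. the n_vocab rows of the output matrix
    const float tensor_split[2] = { 0.6f, 0.4f }; // assumed per-GPU proportions

    float acc = 0.0f;
    for (int dev = 0; dev < 2; ++dev) {
        const int row_lo = (int)(acc * nrows);
        acc += tensor_split[dev];
        const int row_hi = (dev == 1) ? nrows : (int)(acc * nrows);
        printf("GPU %d gets rows [%d, %d)\n", dev, row_lo, row_hi);
    }
    return 0;
}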
2867
  default:
2868
  throw std::runtime_error("unknown architecture");
2869
  }
 
4720
  return gf;
4721
  }
4722
 
 
4723
  static struct ggml_cgraph * llm_build_persimmon(
4724
  llama_context & lctx,
4725
  const llama_batch & batch) {
 
5117
  return gf;
5118
  }
5119
 
5120
+ static struct ggml_cgraph * llm_build_bloom(
5121
  llama_context & lctx,
5122
  const llama_batch & batch) {
5123
+ const auto & model = lctx.model;
5124
+ const auto & hparams = model.hparams;
5125
+ const auto & cparams = lctx.cparams;
5126
 
5127
+ const auto & kv_self = lctx.kv_self;
5128
 
5129
+ GGML_ASSERT(!!kv_self.ctx);
5130
+
5131
+ const int64_t n_embd = hparams.n_embd;
5132
+ const int64_t n_layer = hparams.n_layer;
5133
+ const int64_t n_ctx = cparams.n_ctx;
5134
+ const int64_t n_head = hparams.n_head;
5135
+ const int64_t n_head_kv = hparams.n_head_kv;
5136
+ const int64_t n_embd_head = hparams.n_embd_head();
5137
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
5138
+
5139
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
5140
+
5141
+ const float norm_eps = hparams.f_norm_eps;
5142
+
5143
+ const int32_t n_tokens = batch.n_tokens;
5144
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5145
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5146
+
5147
+ auto & buf_compute = lctx.buf_compute;
5148
+
5149
+ struct ggml_init_params params = {
5150
+ /*.mem_size =*/ buf_compute.size,
5151
+ /*.mem_buffer =*/ buf_compute.data,
5152
+ /*.no_alloc =*/ false,
5153
+ };
5154
+
5155
+ params.no_alloc = true;
5156
+
5157
+ struct ggml_context * ctx0 = ggml_init(params);
5158
+
5159
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
5160
+
5161
+ struct ggml_tensor * cur;
5162
+ struct ggml_tensor * token;
5163
+ struct ggml_tensor * inpL;
5164
+
5165
+ if (batch.token) {
5166
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5167
+
5168
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
5169
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5170
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5171
+ }
5172
+ ggml_set_name(inp_tokens, "inp_tokens");
5173
+
5174
+ token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5175
+ } else {
5176
+ #ifdef GGML_USE_MPI
5177
+ GGML_ASSERT(false && "not implemented");
5178
+ #endif
5179
+
5180
+ token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5181
+
5182
+ ggml_allocr_alloc(lctx.alloc, token);
5183
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5184
+ memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
5185
+ }
5186
+ }
5187
+
5188
+ // KQ_scale
5189
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5190
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5191
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
5192
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5193
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5194
+ }
5195
+
5196
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5197
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5198
+ ggml_set_name(KQ_mask, "KQ_mask");
5199
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
5200
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5201
+ float * data = (float *) KQ_mask->data;
5202
+ memset(data, 0, ggml_nbytes(KQ_mask));
5203
+
5204
+ for (int h = 0; h < 1; ++h) {
5205
+ for (int j = 0; j < n_tokens; ++j) {
5206
+ const llama_pos pos = batch.pos[j];
5207
+ const llama_seq_id seq_id = batch.seq_id[j];
5208
+
5209
+ for (int i = 0; i < n_kv; ++i) {
5210
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5211
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5212
+ }
5213
+ }
5214
+ }
5215
+ }
5216
+ }
5217
+
5218
+ // norm
5219
+ {
5220
+ inpL = ggml_norm(ctx0, token, norm_eps);
5221
+ inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
5222
+ }
5223
+
5224
+ ggml_set_name(inpL, "inpL");
5225
+
5226
+ for (int il = 0; il < n_layer; ++il) {
5227
+ {
5228
+ // Norm
5229
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5230
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5231
+ }
5232
+
5233
+ {
5234
+ // Self Attention
5235
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5236
+
5237
+ struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
5238
+ struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
5239
+ struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5240
+
5241
+ struct ggml_tensor * Qcur = tmpq;
5242
+ struct ggml_tensor * Kcur = tmpk;
5243
+
5244
+ // store key and value to memory
5245
+ {
5246
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5247
+ ggml_set_name(Vcur, "Vcur");
5248
+
5249
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5250
+ ggml_set_name(k, "k");
5251
+
5252
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5253
+ ( n_ctx)*ggml_element_size(kv_self.v),
5254
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5255
+
5256
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5257
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5258
+ }
5259
+
5260
+ struct ggml_tensor * Q =
5261
+ ggml_permute(ctx0,
5262
+ ggml_cpy(ctx0,
5263
+ Qcur,
5264
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
5265
+ 0, 2, 1, 3);
5266
+ ggml_set_name(Q, "Q");
5267
+
5268
+ struct ggml_tensor * K =
5269
+ ggml_view_3d(ctx0, kv_self.k,
5270
+ n_embd_head, n_kv, n_head_kv,
5271
+ ggml_element_size(kv_self.k)*n_embd_gqa,
5272
+ ggml_element_size(kv_self.k)*n_embd_head,
5273
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5274
+ ggml_set_name(K, "K");
5275
+
5276
+ // K * Q
5277
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5278
+ ggml_set_name(KQ, "KQ");
5279
+
5280
+ // KQ_scaled = KQ / sqrt(n_embd_head)
5281
+ // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
5282
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5283
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5284
+
5285
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
5286
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5287
+
5288
+ // KQ_masked = mask_past(KQ_scaled)
5289
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5290
+ ggml_set_name(KQ_masked, "KQ_masked");
5291
+
5292
+ // KQ = soft_max(KQ_masked)
5293
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5294
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5295
+
5296
+ // split cached V into n_head heads
5297
+ struct ggml_tensor * V =
5298
+ ggml_view_3d(ctx0, kv_self.v,
5299
+ n_kv, n_embd_head, n_head_kv,
5300
+ ggml_element_size(kv_self.v)*n_ctx,
5301
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5302
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5303
+ ggml_set_name(V, "V");
5304
+
5305
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5306
+ ggml_set_name(KQV, "KQV");
5307
+
5308
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
5309
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5310
+ ggml_set_name(KQV_merged, "KQV_merged");
5311
+
5312
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
5313
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5314
+ ggml_set_name(cur, "KQV_merged_contiguous");
5315
+ }
5316
+
5317
+ // Projection
5318
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5319
+
5320
+ // Add the input
5321
+ cur = ggml_add(ctx0, cur, inpL);
5322
+
5323
+ struct ggml_tensor * inpFF = cur;
5324
+
5325
+ // FF
5326
+ {
5327
+ // Norm
5328
+ {
5329
+ cur = ggml_norm(ctx0, inpFF, norm_eps);
5330
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5331
+ }
5332
+
5333
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5334
+
5335
+ // GELU activation
5336
+ cur = ggml_gelu(ctx0, cur);
5337
+
5338
+ // Projection
5339
+ cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5340
+ }
5341
+
5342
+ inpL = ggml_add(ctx0, cur, inpFF);
5343
+ }
5344
+
5345
+ // Output Norm
5346
+ {
5347
+ cur = ggml_norm(ctx0, inpL, norm_eps);
5348
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5349
+ }
5350
+ ggml_set_name(cur, "result_norm");
5351
+
5352
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5353
+ ggml_set_name(cur, "result_output");
5354
+
5355
+ ggml_build_forward_expand(gf, cur);
5356
+
5357
+ ggml_free(ctx0);
5358
+
5359
+ return gf;
5360
+ }
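llm_build_bloom applies the position bias with ggml_alibi(ctx0, KQ_scaled, kv_head, n_head, 8); a max bias of 8 gives the canonical ALiBi head slopes m_h = 2^(-8(h+1)/n_head) for a power-of-two head count, and each head then penalizes more distant keys in proportion to m_h times the distance. A small sketch of those slopes; ggml's exact indexing details may differ:

#include <cmath>
#include <cstdio>

// Canonical ALiBi head slopes for a power-of-two number of heads, with max_bias = 8
// as passed to ggml_alibi above. The bias added to a key at distance d behind the
// query is roughly -m_h * d, so farther keys score lower before the softmax.
int main() {
    const int   n_head   = 8;
    const float max_bias = 8.0f;
    for (int h = 0; h < n_head; ++h) {
        const float m = std::pow(2.0f, -max_bias * (h + 1) / n_head);
        printf("head %d: slope %g\n", h, m);
    }
    return 0;
}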
5361
+
5362
+ static struct ggml_cgraph * llm_build_mpt(
5363
+ llama_context & lctx,
5364
+ const llama_batch & batch) {
5365
+ const auto & model = lctx.model;
5366
+ const auto & hparams = model.hparams;
5367
+ const auto & cparams = lctx.cparams;
5368
+
5369
+ const auto & kv_self = lctx.kv_self;
5370
+
5371
+ GGML_ASSERT(!!kv_self.ctx);
5372
+
5373
+ const int64_t n_embd = hparams.n_embd;
5374
+ const int64_t n_layer = hparams.n_layer;
5375
+ const int64_t n_ctx = cparams.n_ctx;
5376
+ const int64_t n_head = hparams.n_head;
5377
+ const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
5378
+ const int64_t n_embd_head = hparams.n_embd_head();
5379
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
5380
+
5381
+ const float norm_eps = hparams.f_norm_eps;
5382
+ const float clamp_kqv = hparams.f_clamp_kqv;
5383
+ const float max_alibi_bias = hparams.f_max_alibi_bias;
5384
+
5385
+ const int n_gpu_layers = model.n_gpu_layers;
5386
+
5387
+ const int32_t n_tokens = batch.n_tokens;
5388
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5389
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5390
+
5391
+ auto & buf_compute = lctx.buf_compute;
5392
+
5393
+ struct ggml_init_params params = {
5394
+ /*.mem_size =*/ buf_compute.size,
5395
+ /*.mem_buffer =*/ buf_compute.data,
5396
+ /*.no_alloc =*/ false,
5397
+ };
5398
+
5399
+ params.no_alloc = true;
5400
+
5401
+ struct ggml_context * ctx0 = ggml_init(params);
5402
+
5403
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
5404
+
5405
+ struct ggml_tensor * cur;
5406
+ struct ggml_tensor * inpL;
5407
+
5408
+ //int warmup = 0;
5409
+ if (batch.token) {
5410
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5411
+
5412
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
5413
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5414
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5415
+ //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
5416
+ }
5417
+
5418
+ ggml_set_name(inp_tokens, "inp_tokens");
5419
+
5420
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5421
+ } else {
5422
+ #ifdef GGML_USE_MPI
5423
+ GGML_ASSERT(false && "not implemented");
5424
+ #endif
5425
+
5426
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5427
+
5428
+ ggml_allocr_alloc(lctx.alloc, inpL);
5429
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5430
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
5431
+ }
5432
+ }
5433
+
5434
+ const int i_gpu_start = n_layer - n_gpu_layers;
5435
+ (void) i_gpu_start;
5436
+
5437
+ // offload functions set the tensor output backend to GPU
5438
+ // tensors are GPU-accelerated if any input or the output has been offloaded
5439
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
5440
+ offload_func_t offload_func_kq = llama_nop;
5441
+ offload_func_t offload_func_v = llama_nop;
5442
+
5443
+ #ifdef GGML_USE_CUBLAS
5444
+ if (n_gpu_layers > n_layer) {
5445
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
5446
+ }
5447
+ if (n_gpu_layers > n_layer + 1) {
5448
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
5449
+ }
5450
+ if (n_gpu_layers > n_layer + 2) {
5451
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
5452
+ }
5453
+ #endif // GGML_USE_CUBLAS
5454
+
5455
+ // KQ_scale
5456
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5457
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5458
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
5459
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5460
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5461
+ }
5462
+
5463
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5464
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5465
+ offload_func_kq(KQ_mask);
5466
+ ggml_set_name(KQ_mask, "KQ_mask");
5467
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
5468
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
5469
+ float * data = (float *) KQ_mask->data;
5470
+ memset(data, 0, ggml_nbytes(KQ_mask));
5471
+
5472
+ for (int h = 0; h < 1; ++h) {
5473
+ for (int j = 0; j < n_tokens; ++j) {
5474
+ const llama_pos pos = batch.pos[j];
5475
+ const llama_seq_id seq_id = batch.seq_id[j];
5476
+
5477
+ for (int i = 0; i < n_kv; ++i) {
5478
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5479
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5480
+ }
5481
+ }
5482
+ }
5483
+ }
5484
+ }
5485
+
5486
+ for (int il = 0; il < n_layer; ++il) {
5487
+ struct ggml_tensor * attn_norm;
5488
+
5489
+ offload_func_t offload_func = llama_nop;
5490
+
5491
+ #ifdef GGML_USE_CUBLAS
5492
+ if (il >= i_gpu_start) {
5493
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
5494
+ }
5495
+ #endif // GGML_USE_CUBLAS
5496
+
5497
+ // self-attention
5498
+ // TODO: refactor into common function (shared with LLaMA)
5499
+ {
5500
+ attn_norm = ggml_norm(ctx0, inpL, norm_eps);
5501
+ offload_func(attn_norm);
5502
+
5503
+ attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
5504
+ offload_func(attn_norm);
5505
+
5506
+ if (1) {
5507
+ cur = attn_norm;
5508
+ }
5509
+
5510
+ // compute QKV
5511
+
5512
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5513
+ offload_func_kq(cur);
5514
+
5515
+ if (clamp_kqv > 0.0f) {
5516
+ cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
5517
+ offload_func_kq(cur);
5518
+ }
5519
+
5520
+ const size_t wsize = ggml_type_size(cur->type);
5521
+
5522
+ struct ggml_tensor * Qcur = ggml_view_3d(
5523
+ ctx0, cur, n_embd_head, n_head, n_tokens,
5524
+ wsize * n_embd_head,
5525
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5526
+ 0);
5527
+ offload_func_kq(Qcur);
5528
+
5529
+ struct ggml_tensor * Kcur = ggml_view_3d(
5530
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5531
+ wsize * n_embd_head,
5532
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5533
+ wsize * n_embd_head * n_head);
5534
+ offload_func_kq(Kcur);
5535
+
5536
+ struct ggml_tensor * tmpv = ggml_view_3d(
5537
+ ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5538
+ wsize * n_embd_head,
5539
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
5540
+ wsize * n_embd_head * (n_head + n_head_kv));
5541
+ offload_func_v(tmpv);
5542
+
5543
+ ggml_set_name(Qcur, "Qcur");
5544
+ ggml_set_name(Kcur, "Kcur");
5545
+
5546
+ {
5547
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5548
+ offload_func_v(Vcur);
5549
+ offload_func_v(Vcur->src[0]->src[0]);
5550
+ ggml_set_name(Vcur, "Vcur");
5551
+
5552
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5553
+ offload_func_kq(k);
5554
+ ggml_set_name(k, "k");
5555
+
5556
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5557
+ ( n_ctx)*ggml_element_size(kv_self.v),
5558
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5559
+ offload_func_v(v);
5560
+
5561
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5562
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5563
+ }
5564
+
5565
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
5566
+ offload_func_kq(Q);
5567
+ ggml_set_name(Q, "Q");
5568
+
5569
+ struct ggml_tensor * K =
5570
+ ggml_view_3d(ctx0, kv_self.k,
5571
+ n_embd_head, n_kv, n_head_kv,
5572
+ ggml_element_size(kv_self.k)*n_embd_gqa,
5573
+ ggml_element_size(kv_self.k)*n_embd_head,
5574
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5575
+ offload_func_kq(K);
5576
+ ggml_set_name(K, "K");
5577
+
5578
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5579
+ offload_func_kq(KQ);
5580
+ ggml_set_name(KQ, "KQ");
5581
+
5582
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5583
+ offload_func_kq(KQ_scaled);
5584
+ ggml_set_name(KQ_scaled, "KQ_scaled");
5585
+
5586
+ // TODO: replace with ggml_add()
5587
+ struct ggml_tensor * KQ_scaled_alibi =
5588
+ ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
5589
+ offload_func_kq(KQ_scaled_alibi);
5590
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5591
+
5592
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5593
+ offload_func_kq(KQ_masked);
5594
+ ggml_set_name(KQ_masked, "KQ_masked");
5595
+
5596
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
5597
+ offload_func_v(KQ_soft_max);
5598
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
5599
+
5600
+ struct ggml_tensor * V =
5601
+ ggml_view_3d(ctx0, kv_self.v,
5602
+ n_kv, n_embd_head, n_head_kv,
5603
+ ggml_element_size(kv_self.v)*n_ctx,
5604
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5605
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5606
+ offload_func_v(V);
5607
+ ggml_set_name(V, "V");
5608
+
5609
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5610
+ offload_func_v(KQV);
5611
+ ggml_set_name(KQV, "KQV");
5612
+
5613
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5614
+ offload_func_v(KQV_merged);
5615
+ ggml_set_name(KQV_merged, "KQV_merged");
5616
+
5617
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5618
+ offload_func_v(cur);
5619
+ ggml_set_name(cur, "KQV_merged_contiguous");
5620
+
5621
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5622
+ offload_func(cur);
5623
+ ggml_set_name(cur, "result_wo");
5624
+ }
5625
+
5626
+ // Add the input
5627
+ cur = ggml_add(ctx0, cur, inpL);
5628
+ offload_func(cur);
5629
+
5630
+ struct ggml_tensor * attn_out = cur;
5631
+
5632
+ // feed forward
5633
+ {
5634
+ // Norm
5635
+ {
5636
+ cur = ggml_norm(ctx0, attn_out, norm_eps);
5637
+ offload_func(cur);
5638
+
5639
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
5640
+ offload_func(cur);
5641
+ }
5642
+
5643
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5644
+ offload_func(cur);
5645
+
5646
+ cur = ggml_gelu(ctx0, cur);
5647
+ offload_func(cur);
5648
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5649
+ offload_func(cur);
5650
+ }
5651
+
5652
+ cur = ggml_add(ctx0, cur, attn_out);
5653
+ offload_func(cur);
5654
+ // input for next layer
5655
+ inpL = cur;
5656
+ }
5657
+
5658
+ cur = inpL;
5659
+
5660
+ // norm
5661
+ {
5662
+ cur = ggml_norm(ctx0, cur, norm_eps);
5663
+ offload_func_nr(cur);
5664
+
5665
+ cur = ggml_mul(ctx0, cur, model.output_norm);
5666
+ ggml_set_name(cur, "result_norm");
5667
+ }
5668
+
5669
+ cur = ggml_mul_mat(ctx0, model.output, cur);
5670
+ ggml_set_name(cur, "result_output");
5671
+
5672
+ ggml_build_forward_expand(gf, cur);
5673
+
5674
+ ggml_free(ctx0);
5675
+
5676
+ return gf;
5677
+ }
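MPT stores the attention projection as a single fused wqkv tensor, and the three ggml_view_3d calls above slice it per token row: Q starts at offset 0, K after n_head heads, V after n_head + n_head_kv heads, all with a row stride of (n_head + 2*n_head_kv)*n_embd_head elements. A standalone sketch of the same offset arithmetic, with assumed dimensions:

#include <cstdio>

// Illustrative offset math for the fused QKV rows produced by wqkv: each token row
// holds [Q (n_head heads) | K (n_head_kv heads) | V (n_head_kv heads)], mirroring the
// byte offsets passed to ggml_view_3d above (wsize = bytes per element, f32 here).
int main() {
    const int n_embd_head = 128, n_head = 32, n_head_kv = 32; // MPT: no GQA, n_head_kv == n_head
    const int wsize = 4;                                      // f32 elements in the compute graph

    const int row_stride = wsize * n_embd_head * (n_head + 2 * n_head_kv); // bytes per token row
    const int q_off      = 0;
    const int k_off      = wsize * n_embd_head * n_head;
    const int v_off      = wsize * n_embd_head * (n_head + n_head_kv);

    printf("row stride %d bytes, Q@%d K@%d V@%d\n", row_stride, q_off, k_off, v_off);
    return 0;
}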
5678
+
5679
+ static struct ggml_cgraph * llama_build_graph(
5680
+ llama_context & lctx,
5681
+ const llama_batch & batch) {
5682
+ const auto & model = lctx.model;
5683
+
5684
+ struct ggml_cgraph * result = NULL;
5685
+
5686
+ switch (model.arch) {
5687
+ case LLM_ARCH_LLAMA:
5688
+ {
5689
+ result = llm_build_llama(lctx, batch);
5690
+ } break;
5691
+ case LLM_ARCH_BAICHUAN:
5692
+ {
5693
+ result = llm_build_baichaun(lctx, batch);
5694
+ } break;
5695
+ case LLM_ARCH_FALCON:
5696
+ {
5697
+ result = llm_build_falcon(lctx, batch);
5698
+ } break;
5699
+ case LLM_ARCH_STARCODER:
5700
+ {
5701
+ result = llm_build_starcoder(lctx, batch);
5702
+ } break;
5703
+ case LLM_ARCH_PERSIMMON:
5704
+ {
5705
+ result = llm_build_persimmon(lctx, batch);
5706
+ } break;
5707
+ case LLM_ARCH_REFACT:
5708
+ {
5709
+ result = llm_build_refact(lctx, batch);
5710
+ } break;
5711
+ case LLM_ARCH_BLOOM:
5712
+ {
5713
+ result = llm_build_bloom(lctx, batch);
5714
+ } break;
5715
+ case LLM_ARCH_MPT:
5716
+ {
5717
+ result = llm_build_mpt(lctx, batch);
5718
  } break;
5719
  default:
5720
  GGML_ASSERT(false);
 
5846
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
5847
  model.arch == LLM_ARCH_BAICHUAN ||
5848
  model.arch == LLM_ARCH_FALCON ||
5849
+ model.arch == LLM_ARCH_REFACT ||
5850
+ model.arch == LLM_ARCH_MPT;
5851
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
5852
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
5853
  n_threads = 1;
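The n_layer + 3 threshold matches the CUDA offload tiers visible in the graph builders above: n_gpu_layers > n_layer offloads the non-repeating norm, > n_layer + 1 the V path, and > n_layer + 2 the KQ path, so at n_layer + 3 everything the CUDA backend handles is resident on the GPU and decoding is switched to a single CPU thread. A tiny sketch of that check; the helper name is illustrative, not a llama.cpp function:

#include <cstdio>

// Hypothetical helper mirroring the threshold above: with n_gpu_layers >= n_layer + 3
// every offloadable tensor group is on the GPU, which is when n_threads is forced to 1.
static bool fully_offloaded(int n_gpu_layers, int n_layer) {
    return n_gpu_layers >= n_layer + 3;
}

int main() {
    const int n_layer = 32;
    const int tests[] = { 32, 33, 34, 35 };
    for (int n_gpu_layers : tests) {
        printf("n_gpu_layers=%d -> fully offloaded: %s\n",
               n_gpu_layers, fully_offloaded(n_gpu_layers, n_layer) ? "yes" : "no");
    }
    return 0;
}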
 
6348
  for (int i = 0; i < (int)text_utf.size(); i++) {
6349
  const std::string & utf_char = text_utf[i];
6350
  bool split_condition = false;
 
6351
  int bytes_remain = text_utf.size() - i;
6352
  // forward backward lookups
6353
  const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
 
6373
  if (!split_condition && bytes_remain >= 3) {
6374
  // 're|'ve|'ll
6375
  if (utf_char == "\'" && (
6376
+ (utf_char_next == "r" && utf_char_next_next == "e") ||
6377
+ (utf_char_next == "v" && utf_char_next_next == "e") ||
6378
+ (utf_char_next == "l" && utf_char_next_next == "l"))
6379
  ) {
6380
  split_condition = true;
6381
  }
 
6426
  else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
6427
  split_condition = true;
6428
  }
6429
+ else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
6430
  split_condition = true;
6431
  }
6432
  }
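These split conditions mirror the GPT-2 pre-tokenizer regex: contractions such as 're, 've and 'll become their own pieces, and a leading space stays attached to the following word or number. As an illustration of the intended behaviour; this only documents the expected pieces, it does not re-implement the tokenizer:

#include <cstdio>
#include <string>
#include <vector>

// Expected GPT-2-style pre-tokenization for a small input, matching the split
// conditions above before any byte-pair merging is applied.
int main() {
    const std::string text = "We've 3 cats";
    const std::vector<std::string> expected = { "We", "'ve", " 3", " cats" };
    printf("input: \"%s\"\n", text.c_str());
    for (const auto & piece : expected) {
        printf("  piece: \"%s\"\n", piece.c_str());
    }
    return 0;
}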
 
7945
  const std::string name = ggml_get_name(meta);
7946
 
7947
  // TODO: avoid hardcoded tensor names - use the TN_* constants
7948
+ if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
7949
  ++n_attention_wv;
7950
  }
7951
  else if (name.find("ffn_down.weight") != std::string::npos) {
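The widened check above exists because architectures with a fused QKV projection, such as the MPT and BLOOM models added in this commit, have no separate attn_v.weight tensor; counting attn_qkv.weight as well keeps n_attention_wv in line with the number of layers, which the quantization-type heuristics further down appear to rely on. A minimal illustration of that counting over GGUF-style tensor names:

#include <cstdio>
#include <string>
#include <vector>

// Illustrative only: counting "V-like" attention tensors the same way the quantize
// loop above does, treating a fused attn_qkv.weight as the V carrier when there is
// no separate attn_v.weight (e.g. MPT-style checkpoints).
int main() {
    const std::vector<std::string> names = {
        "blk.0.attn_qkv.weight", "blk.0.ffn_down.weight",
        "blk.1.attn_qkv.weight", "blk.1.ffn_down.weight",
    };
    int n_attention_wv = 0;
    for (const auto & name : names) {
        if (name.find("attn_v.weight")   != std::string::npos ||
            name.find("attn_qkv.weight") != std::string::npos) {
            ++n_attention_wv;
        }
    }
    printf("n_attention_wv = %d\n", n_attention_wv); // 2
    return 0;
}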
otherarch/llama_v3.cpp CHANGED
@@ -63,9 +63,8 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
63
  #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__)
64
  #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
65
 
66
-
67
- #if !defined(GGML_USE_CUBLAS)
68
  #include "ggml-alloc.h"
 
69
  #define LLAMA_V3_USE_ALLOCATOR
70
  #else
71
  #define LLAMA_V3_USE_SCRATCH
@@ -725,7 +724,7 @@ struct llama_v3_model_loader {
725
  }
726
  }
727
 
728
- struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
729
  auto it = tensors_map.name_to_idx.find(name);
730
  if (it == tensors_map.name_to_idx.end()) {
731
  throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));
@@ -739,7 +738,7 @@ struct llama_v3_model_loader {
739
  return get_tensor_for(lt, backend);
740
  }
741
 
742
- struct ggml_tensor * get_tensor_for(llama_v3_load_tensor & lt, ggml_backend backend) {
743
  struct ggml_tensor * tensor;
744
  if (backend != GGML_BACKEND_CPU) {
745
  ggml_set_no_alloc(ggml_ctx, true);
@@ -1230,8 +1229,8 @@ static void llama_v3_model_load_internal(
1230
 
1231
  // "output" tensor
1232
  {
1233
- ggml_backend backend_norm;
1234
- ggml_backend backend_output;
1235
  if (n_gpu_layers > int(n_layer)) { // NOLINT
1236
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
1237
  // on Windows however this is detrimental unless everything is on the GPU
@@ -1261,8 +1260,8 @@ static void llama_v3_model_load_internal(
1261
 
1262
  model.layers.resize(n_layer);
1263
  for (uint32_t i = 0; i < n_layer; ++i) {
1264
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; // NOLINT
1265
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD_SPLIT; // NOLINT
1266
 
1267
  auto & layer = model.layers[i];
1268
 
 
63
  #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__)
64
  #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
65
 
 
 
66
  #include "ggml-alloc.h"
67
+ #if !defined(GGML_USE_CUBLAS)
68
  #define LLAMA_V3_USE_ALLOCATOR
69
  #else
70
  #define LLAMA_V3_USE_SCRATCH
 
724
  }
725
  }
726
 
727
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend_type backend) {
728
  auto it = tensors_map.name_to_idx.find(name);
729
  if (it == tensors_map.name_to_idx.end()) {
730
  throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));
 
738
  return get_tensor_for(lt, backend);
739
  }
740
 
741
+ struct ggml_tensor * get_tensor_for(llama_v3_load_tensor & lt, ggml_backend_type backend) {
742
  struct ggml_tensor * tensor;
743
  if (backend != GGML_BACKEND_CPU) {
744
  ggml_set_no_alloc(ggml_ctx, true);
 
1229
 
1230
  // "output" tensor
1231
  {
1232
+ ggml_backend_type backend_norm;
1233
+ ggml_backend_type backend_output;
1234
  if (n_gpu_layers > int(n_layer)) { // NOLINT
1235
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
1236
  // on Windows however this is detrimental unless everything is on the GPU
 
1260
 
1261
  model.layers.resize(n_layer);
1262
  for (uint32_t i = 0; i < n_layer; ++i) {
1263
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; // NOLINT
1264
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD_SPLIT; // NOLINT
1265
 
1266
  auto & layer = model.layers[i];
1267
 
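The changes to otherarch/llama_v3.cpp are mechanical: the ggml-alloc.h include is hoisted out of the GGML_USE_CUBLAS guard so the allocator header is always available, and the per-tensor placement enum previously named ggml_backend is now ggml_backend_type, presumably because this commit also adds ggml-backend.c/h, where ggml_backend becomes the name of the new backend object. The enum constants themselves are unchanged; for reference, a sketch of the renamed enum as it appears in ggml.h of this vintage, with values assumed from that version:

// ggml.h (this era): per-tensor placement enum, renamed from `ggml_backend` to
// `ggml_backend_type` to free the old name for the new ggml-backend API object.
enum ggml_backend_type {
    GGML_BACKEND_CPU       = 0,
    GGML_BACKEND_GPU       = 10,
    GGML_BACKEND_GPU_SPLIT = 20,
};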
prompts/mnemonics.txt ADDED
@@ -0,0 +1,93 @@
1
+ For each kanji character, write a Markdown‐formatted mnemonic that uses its keyword and the keyword of all its components.
2
+
3
+ Kanji: 欠 (lack of)
4
+ Components: 𠂊 (hook claw), 人 (person)
5
+ Mnemonic: This **person** is a pirate. He lost his hand to a crocodile many years ago. Nowadays, the ***lack of*** a hand does not bother him too much. In fact, the **hook claw** that replaces it is the mark of a true pirate, so he is quite proud of it!
6
+
7
+ Kanji: 類 (kind (of something))
8
+ Components: 米 (rice), 大 (large), 頁 (page)
9
+ Mnemonic: The waiter at a Chinese restaurant hands you a **large** menu. Each **page** has all ***kinds*** of **rice** on offer!
10
+
11
+ Kanji: 燃 (burn)
12
+ Components: 火 (fire), 然 (sort of thing)
13
+ Mnemonic: ***Burning*** things up with **fire** is just my **sort of thing**. (Spoken like a true pyromaniac.)
14
+
15
+ Kanji: 頂 (top of)
16
+ Components: 丁 (street), 頁 (page)
17
+ Mnemonic: To be at the ***top of*** your game, you need both practical knowledge (**street** smarts) and theoretical knowledge (having read many **pages**).
18
+
19
+ Kanji: 険 (risky and steep)
20
+ Components: 阝 (small village), 㑒 (consensus)
21
+ Mnemonic: Everyone agrees (there is **consensus**) that the path to the **small village** is ***risky and steep***.
22
+
23
+ Kanji: 困 (distressed)
24
+ Components: 囗 (closed box), 木 (tree)
25
+ Mnemonic: You would feel ***distressed*** too if you were a **tree** trapped in a **closed box**! I have no place to grow!
26
+
27
+ Kanji: 頭 (head)
28
+ Components: 豆 (bean), 頁 (page)
29
+ Mnemonic: What do you have in that ***head*** of yours? A **bean** for a brain? Go read more **pages** and become more knowledgeable about the world!
30
+
31
+ Kanji: 確 (certain)
32
+ Components: 石 (stone), 冖 (roof without a chimney), 隹 (old bird)
33
+ Mnemonic: An **old bird** has made a nest on your **roof**. What do you do? You call Misaka from <cite>A ***Certain*** Scientific Railgun</cite> to get rid of it, of course! But she doesn’t really want to vaporize the poor thing, so she just throws a **stone** to scare it away. (What was the point of calling her, then‽)
34
+
35
+ Kanji: 魚 (fish)
36
+ Components: 𠂊 (hook claw), 田 (rice field), 灬 (fire sparks)
37
+ Mnemonic: Catch ***fish*** with a **hook**, collect rice from the **rice field**, cook them with **fire**… And my meal is ready!
38
+
39
+ Kanji: 警 (to police (something))
40
+ Components: 敬 (respect), 言 (say)
41
+ Mnemonic: ***To police something*** is to make people **respect** what the law **says**.
42
+
43
+ Kanji: 筆 (writing brush)
44
+ Components: 竹 (bamboo), 聿 (brush)
45
+ Mnemonic: A traditional ***writing brush*** is a **brush** made of **bamboo**.
46
+
47
+ Kanji: 獄 (prison)
48
+ Components: 犭 (animal), 言 (say), 犬 (dog)
49
+ Mnemonic: In ***prison***, like in the **animal** kingdom, only the toughest survive. You have to watch what you **say**. It’s a **dog**‐eat‐dog world.
50
+
51
+ Kanji: 新 (new)
52
+ Components: 立 (standing up), 木 (tree), 斤 (axe)
53
+ Mnemonic: In order for a ***new*** construction to be made, an empty lot is needed. If there are any **trees** **standing up**, they must be cut down with an **axe**.
54
+
55
+ Kanji: 怪 (suspicious)
56
+ Components: 忄 (weak heart), 圣 (sacred)
57
+ Mnemonic: That painting of the **Sacred** **Heart** of Jesus looks ***suspicious***. I think it might be a forgery.
58
+
59
+ Kanji: 温 (warm (to the touch))
60
+ Components: 氵 (water drops), 日 (sun), 皿 (dish)
61
+ Mnemonic: If you leave **water** on a **dish** in the **sun**, it will get ***warm***.
62
+
63
+ Kanji: 階 (floor (of a building))
64
+ Components: 阝 (small village), 皆 (all)
65
+ Mnemonic: It might be a **small village**, but, despite that, **all** of its buildings have many ***floors***. It’s a village of skyscrapers!
66
+
67
+ Kanji: 多 (many)
68
+ Components: 夕 (evening (before sunset)), 夕 (evening (before sunset))
69
+ Mnemonic: Two **evenings** in a day would be one too ***many***.
70
+
71
+ Kanji: 別 (separate)
72
+ Components: 口 (mouth), 万 (ten thousand), 刂 (knife)
73
+ Mnemonic: Tom Six is at it again. For his next flick, he wants to stitch together **ten thousand** people, **mouth**‐to‐anus. One of the most graphic and disturbing scenes will feature one of the victims using a **knife** to ***separate*** perself.
74
+
75
+ Kanji: 並 (line up)
76
+ Components: 䒑 (antlers on a wall), 业 (runway)
77
+ Mnemonic: In order to land a plane you have to ***line up*** properly with the **runway**. The things that look like **antlers** at the end of the runway are the control towers; you should follow their instructions.
78
+
79
+ Kanji: 姿 (figure)
80
+ Components: 次 (next), 女 (woman)
81
+ Mnemonic: The **next** **woman** that I date will have a perfect ***figure***. Because I’m done with 3D women—it will *literally* be an anime figure!
82
+
83
+ Kanji: 実 (real)
84
+ Components: 宀 (roof with a chimney), 𡗗 (three people)
85
+ Mnemonic: Living under a **roof with a chimney** with **three people** (a wife and two children)—a happy family life—is not something I could have ever imagined. It does not feel ***real***.
86
+
87
+ Kanji: 謝 (apologize)
88
+ Components: 言 (say), 射 (shoot)
89
+ Mnemonic: **Shot** first, ***apologize*** (**say** you are sorry) later.
90
+
91
+ Kanji: 提 (propose)
92
+ Components: 扌 (left hand), 是 (go with)
93
+ Mnemonic: