Committed by Illumotion
Commit 7243d06
1 Parent(s): da7d918

Upload folder using huggingface_hub

.gitignore CHANGED
@@ -45,7 +45,6 @@ models-mnt
  /server
  /simple
  /batched
- /batched-bench
  /export-lora
  /finetune
  /speculative
@@ -107,6 +106,3 @@ tests/test-tokenizer-1-bpe
  rocblas.dll
  hipblas.dll
  koboldcpp_hipblas.so
-
- # Jetbrains idea folder
- .idea/
CMakeLists.txt CHANGED
@@ -356,8 +356,6 @@ add_library(ggml OBJECT
  ggml.h
  ggml-alloc.c
  ggml-alloc.h
- ggml-backend.c
- ggml-backend.h
  k_quants.h
  k_quants.c
  ${GGML_SOURCES_CUDA})
Makefile CHANGED
@@ -372,8 +372,6 @@ endif # LLAMA_NO_K_QUANTS
  #there's no intrinsics or special gpu ops used here, so we can have a universal object
  ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
  $(CC) $(CFLAGS) -c $< -o $@
- ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
- $(CC) $(CFLAGS) -c $< -o $@

  #version 2 libs
  ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
@@ -404,7 +402,7 @@ ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-ope
  $(CC) $(CFLAGS) -c $< -o $@

  # intermediate objects
- llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h
+ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
  common.o: common/common.cpp common/common.h common/log.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -429,7 +427,7 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
  clean:
  rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so

- main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
+ main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
  @echo
  @echo '==== Run ./main -h for help. ===='
@@ -440,11 +438,11 @@ gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)


  #generated libraries
- koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
+ koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
  $(DEFAULT_BUILD)

  ifdef OPENBLAS_BUILD
- koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
+ koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
  $(OPENBLAS_BUILD)
  else
  koboldcpp_openblas:
@@ -452,7 +450,7 @@ koboldcpp_openblas:
  endif

  ifdef FAILSAFE_BUILD
- koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
+ koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o grammar-parser.o $(OBJS)
  $(FAILSAFE_BUILD)
  else
  koboldcpp_failsafe:
@@ -460,7 +458,7 @@ koboldcpp_failsafe:
  endif

  ifdef NOAVX2_BUILD
- koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
+ koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o grammar-parser.o $(OBJS)
  $(NOAVX2_BUILD)
  else
  koboldcpp_noavx2:
@@ -468,7 +466,7 @@ koboldcpp_noavx2:
  endif

  ifdef CLBLAST_BUILD
- koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
+ koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
  $(CLBLAST_BUILD)
  else
  koboldcpp_clblast:
@@ -476,7 +474,7 @@ koboldcpp_clblast:
  endif

  ifdef CUBLAS_BUILD
- koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
+ koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
  $(CUBLAS_BUILD)
  else
  koboldcpp_cublas:
@@ -484,7 +482,7 @@ koboldcpp_cublas:
  endif

  ifdef HIPBLAS_BUILD
- koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(HIP_OBJS) $(OBJS)
+ koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS)
  $(HIPBLAS_BUILD)
  else
  koboldcpp_hipblas:
@@ -492,15 +490,15 @@ koboldcpp_hipblas:
  endif

  # tools
- quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o
+ quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
+ quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
+ quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
+ quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
- quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
+ quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

Package.swift CHANGED
@@ -1,10 +1,10 @@
- // swift-tools-version:5.5
+ // swift-tools-version:5.3

  import PackageDescription

  #if arch(arm) || arch(arm64)
  let platforms: [SupportedPlatform]? = [
- .macOS(.v12),
+ .macOS(.v11),
  .iOS(.v14),
  .watchOS(.v4),
  .tvOS(.v14)
@@ -41,13 +41,12 @@ let package = Package(
  "ggml.c",
  "llama.cpp",
  "ggml-alloc.c",
- "ggml-backend.c",
  "k_quants.c",
  ] + additionalSources,
  resources: resources,
  publicHeadersPath: "spm-headers",
  cSettings: [
- .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
+ .unsafeFlags(["-Wno-shorten-64-to-32"]),
  .define("GGML_USE_K_QUANTS"),
  .define("GGML_USE_ACCELERATE")
  // NOTE: NEW_LAPACK will required iOS version 16.4+
colab.ipynb CHANGED
@@ -33,7 +33,7 @@
  "!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &\r\n",
  "!sleep 10\r\n",
  "!cat nohup.out\r\n",
- "!python koboldcpp.py model.ggml --usecublas 0 mmq --gpulayers $Layers\r\n"
+ "!python koboldcpp.py model.ggml --usecublas 0 mmq --gpulayers $Layers --hordeconfig concedo\r\n"
  ]
  }
  ],
common/CMakeLists.txt CHANGED
@@ -5,8 +5,6 @@ set(TARGET common)
  add_library(${TARGET} OBJECT
  common.h
  common.cpp
- sampling.h
- sampling.cpp
  console.h
  console.cpp
  grammar-parser.h
common/common.cpp CHANGED
@@ -107,7 +107,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  std::string arg;
  gpt_params default_params;
  const std::string arg_prefix = "--";
- llama_sampling_params & sparams = params.sampling_params;

  for (int i = 1; i < argc; i++) {
  arg = argv[i];
@@ -185,7 +184,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  invalid_param = true;
  break;
  }
- sparams.top_k = std::stoi(argv[i]);
+ params.top_k = std::stoi(argv[i]);
  } else if (arg == "-c" || arg == "--ctx-size") {
  if (++i >= argc) {
  invalid_param = true;
@@ -217,73 +216,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  invalid_param = true;
  break;
  }
- sparams.top_p = std::stof(argv[i]);
+ params.top_p = std::stof(argv[i]);
  } else if (arg == "--temp") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.temp = std::stof(argv[i]);
+ params.temp = std::stof(argv[i]);
  } else if (arg == "--tfs") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.tfs_z = std::stof(argv[i]);
+ params.tfs_z = std::stof(argv[i]);
  } else if (arg == "--typical") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.typical_p = std::stof(argv[i]);
+ params.typical_p = std::stof(argv[i]);
  } else if (arg == "--repeat-last-n") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.repeat_last_n = std::stoi(argv[i]);
+ params.repeat_last_n = std::stoi(argv[i]);
  } else if (arg == "--repeat-penalty") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.repeat_penalty = std::stof(argv[i]);
+ params.repeat_penalty = std::stof(argv[i]);
  } else if (arg == "--frequency-penalty") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.frequency_penalty = std::stof(argv[i]);
+ params.frequency_penalty = std::stof(argv[i]);
  } else if (arg == "--presence-penalty") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.presence_penalty = std::stof(argv[i]);
+ params.presence_penalty = std::stof(argv[i]);
  } else if (arg == "--mirostat") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.mirostat = std::stoi(argv[i]);
+ params.mirostat = std::stoi(argv[i]);
  } else if (arg == "--mirostat-lr") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.mirostat_eta = std::stof(argv[i]);
+ params.mirostat_eta = std::stof(argv[i]);
  } else if (arg == "--mirostat-ent") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.mirostat_tau = std::stof(argv[i]);
+ params.mirostat_tau = std::stof(argv[i]);
  } else if (arg == "--cfg-negative-prompt") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.cfg_negative_prompt = argv[i];
+ params.cfg_negative_prompt = argv[i];
  } else if (arg == "--cfg-negative-prompt-file") {
  if (++i >= argc) {
  invalid_param = true;
@@ -295,16 +294,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  invalid_param = true;
  break;
  }
- std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
- if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
- sparams.cfg_negative_prompt.pop_back();
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
+ if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
+ params.cfg_negative_prompt.pop_back();
  }
  } else if (arg == "--cfg-scale") {
  if (++i >= argc) {
  invalid_param = true;
  break;
  }
- sparams.cfg_scale = std::stof(argv[i]);
+ params.cfg_scale = std::stof(argv[i]);
  } else if (arg == "-b" || arg == "--batch-size") {
  if (++i >= argc) {
  invalid_param = true;
@@ -513,7 +512,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  } else if (arg == "--ignore-eos") {
  params.ignore_eos = true;
  } else if (arg == "--no-penalize-nl") {
- sparams.penalize_nl = false;
+ params.penalize_nl = false;
  } else if (arg == "-l" || arg == "--logit-bias") {
  if (++i >= argc) {
  invalid_param = true;
@@ -525,7 +524,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  std::string value_str;
  try {
  if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
- sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+ params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
  } else {
  throw std::exception();
  }
@@ -628,8 +627,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  }

  void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
- const llama_sampling_params & sparams = params.sampling_params;
-
  printf("usage: %s [options]\n", argv[0]);
  printf("\n");
  printf("options:\n");
@@ -662,19 +659,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
  printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
  printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
- printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
- printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
- printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
- printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
- printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
- printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
- printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
- printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
+ printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+ printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+ printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+ printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+ printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+ printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+ printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+ printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
  printf(" --mirostat N use Mirostat sampling.\n");
  printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
- printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
- printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
- printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
+ printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+ printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+ printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
  printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
  printf(" modifies the likelihood of token appearing in the completion,\n");
  printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
@@ -685,7 +682,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" negative prompt to use for guidance. (default: empty)\n");
  printf(" --cfg-negative-prompt-file FNAME\n");
  printf(" negative prompt file to use for guidance. (default: empty)\n");
- printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
+ printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
  printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
  printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
  printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
@@ -693,7 +690,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  printf(" --no-penalize-nl do not penalize newline token\n");
  printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
  printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
- printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
+ printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
  printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
  printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
  printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
@@ -843,7 +840,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
  }

  if (params.ignore_eos) {
- params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+ params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
  }

  {
@@ -935,6 +932,127 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
  return result;
  }

+ //
+ // Sampling utils
+ //
+
+ llama_token llama_sample_token(
+ struct llama_context * ctx,
+ struct llama_context * ctx_guidance,
+ struct llama_grammar * grammar,
+ const struct gpt_params & params,
+ const std::vector<llama_token> & last_tokens,
+ std::vector<llama_token_data> & candidates,
+ int idx) {
+ const int n_ctx = llama_n_ctx(ctx);
+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+ const float temp = params.temp;
+ const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
+ const float top_p = params.top_p;
+ const float tfs_z = params.tfs_z;
+ const float typical_p = params.typical_p;
+ const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+ const float repeat_penalty = params.repeat_penalty;
+ const float alpha_presence = params.presence_penalty;
+ const float alpha_frequency = params.frequency_penalty;
+ const int mirostat = params.mirostat;
+ const float mirostat_tau = params.mirostat_tau;
+ const float mirostat_eta = params.mirostat_eta;
+ const bool penalize_nl = params.penalize_nl;
+
+ llama_token id = 0;
+
+ float * logits = llama_get_logits_ith(ctx, idx);
+
+ // Apply params.logit_bias map
+ for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+ logits[it->first] += it->second;
+ }
+
+ candidates.clear();
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+ candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+ }
+
+ llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
+
+ if (ctx_guidance) {
+ llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
+ }
+
+ // apply penalties
+ if (!last_tokens.empty()) {
+ const float nl_logit = logits[llama_token_nl(ctx)];
+ const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
+
+ llama_sample_repetition_penalty(ctx, &cur_p,
+ last_tokens.data() + last_tokens.size() - last_n_repeat,
+ last_n_repeat, repeat_penalty);
+ llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
+ last_tokens.data() + last_tokens.size() - last_n_repeat,
+ last_n_repeat, alpha_frequency, alpha_presence);
+
+ if (!penalize_nl) {
+ for (size_t idx = 0; idx < cur_p.size; idx++) {
+ if (cur_p.data[idx].id == llama_token_nl(ctx)) {
+ cur_p.data[idx].logit = nl_logit;
+ break;
+ }
+ }
+ }
+ }
+
+ if (grammar != NULL) {
+ llama_sample_grammar(ctx, &cur_p, grammar);
+ }
+
+ if (temp <= 0) {
+ // Greedy sampling
+ id = llama_sample_token_greedy(ctx, &cur_p);
+ } else {
+ if (mirostat == 1) {
+ static float mirostat_mu = 2.0f * mirostat_tau;
+ const int mirostat_m = 100;
+ llama_sample_temp(ctx, &cur_p, temp);
+ id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+ } else if (mirostat == 2) {
+ static float mirostat_mu = 2.0f * mirostat_tau;
+ llama_sample_temp(ctx, &cur_p, temp);
+ id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+ } else {
+ // Temperature sampling
+ size_t min_keep = std::max(1, params.n_probs);
+ llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
+ llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
+ llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
+ llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
+ llama_sample_temp(ctx, &cur_p, temp);
+
+ {
+ const int n_top = 10;
+ LOG("top %d candidates:\n", n_top);
+
+ for (int i = 0; i < n_top; i++) {
+ const llama_token id = cur_p.data[i].id;
+ LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
+ }
+ }
+
+ id = llama_sample_token(ctx, &cur_p);
+
+ LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
+ }
+ }
+ // printf("`%d`", candidates_p.size);
+
+ if (grammar != NULL) {
+ llama_grammar_accept_token(ctx, grammar, id);
+ }
+
+ return id;
+ }
+
  //
  // YAML utils
  //
@@ -1086,8 +1204,6 @@ std::string get_sortable_timestamp() {

  void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
  const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
- const llama_sampling_params & sparams = params.sampling_params;
-
  fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
  fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
  fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
@@ -1134,21 +1250,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l

  fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
  fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
- dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
- fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
+ dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
+ fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
  fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
  fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
  fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
  fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
  fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
- fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
+ fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
  dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
  fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
  fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
  fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);

- const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
- const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
+ const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
+ const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
  fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");

  dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
@@ -1161,7 +1277,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
  fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());

  fprintf(stream, "logit_bias:\n");
- for (std::pair<llama_token, float> lb : sparams.logit_bias) {
+ for (std::pair<llama_token, float> lb : params.logit_bias) {
  if (ignore_eos && lb.first == logit_bias_eos->first) {
  continue;
  }
@@ -1185,30 +1301,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
  fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
  fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
  fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
- fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
- fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
- fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
+ fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
+ fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
+ fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
  fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
  fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
  fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
  fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
  fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
  fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
- fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
+ fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
  fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
  fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
- fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
+ fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
  fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
  fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
  fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
- fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
+ fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
  dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
  fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
  fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
  fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
  dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
  fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
- fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
+ fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);

  fprintf(stream, "reverse_prompt:\n");
  for (std::string ap : params.antiprompt) {
@@ -1226,15 +1342,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
  fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
  fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
  fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
- fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
+ fprintf(stream, "temp: %f # default: 0.8\n", params.temp);

  const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
  dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);

- fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
+ fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
  fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
- fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
- fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
- fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
+ fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
+ fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
+ fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
  fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
  }
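The llama_sample_token helper added above reads every sampling knob directly from gpt_params instead of the removed llama_sampling_params struct. A minimal sketch of the fields involved, with illustrative values that are not taken from this commit:

    gpt_params params;
    params.temp        = 0.7f;       // <= 0 switches llama_sample_token to greedy sampling
    params.top_k       = 40;         // <= 0 falls back to the full vocabulary
    params.top_p       = 0.95f;
    params.mirostat    = 0;          // 1 or 2 selects the Mirostat samplers instead
    params.penalize_nl = false;      // same effect as the --no-penalize-nl flag
    params.logit_bias[15043] = 1.0f; // same effect as --logit-bias 15043+1 on the CLI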
common/common.h CHANGED
@@ -4,8 +4,6 @@

  #include "llama.h"

- #include "sampling.h"
-
  #define LOG_NO_FILE_LINE_FUNCTION
  #include "log.h"

@@ -51,6 +49,7 @@ struct gpt_params {
  int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
  int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
  float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
  int32_t n_beams = 0; // if non-zero then use beam search of given width.
  float rope_freq_base = 0.0f; // RoPE base frequency
  float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
@@ -68,8 +67,13 @@ struct gpt_params {
  int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
  float mirostat_tau = 5.00f; // target entropy
  float mirostat_eta = 0.10f; // learning rate
- // // sampling parameters
- struct llama_sampling_params sampling_params;
+
+ std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
+ // Classifier-Free Guidance
+ // https://arxiv.org/abs/2306.17806
+ std::string cfg_negative_prompt; // string to help guidance
+ float cfg_scale = 1.f; // How strong is guidance

  std::string model = "models/7B/ggml-model-f16.gguf"; // model path
  std::string model_draft = ""; // draft model for speculative decoding
@@ -111,6 +115,7 @@ struct gpt_params {
  bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
  bool ignore_eos = false; // ignore generated EOS tokens
  bool instruct = false; // instruction mode (used for Alpaca models)
+ bool penalize_nl = true; // consider newlines as a repeatable token
  bool logits_all = false; // return logits for all tokens in the batch
  bool use_mmap = true; // use mmap for faster loads
  bool use_mlock = false; // use mlock to keep model in memory
@@ -175,6 +180,36 @@ std::string llama_detokenize_bpe(
  llama_context * ctx,
  const std::vector<llama_token> & tokens);

+ //
+ // Sampling utils
+ //
+
+ // this is a common sampling function used across the examples for convenience
+ // it can serve as a starting point for implementing your own sampling function
+ //
+ // required:
+ // - ctx: context to use for sampling
+ // - params: sampling parameters
+ //
+ // optional:
+ // - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
+ // - grammar: grammar to use for sampling, ignore if NULL
+ // - last_tokens: needed for repetition penalty, ignore if empty
+ // - idx: sample from llama_get_logits_ith(ctx, idx)
+ //
+ // returns:
+ // - token: sampled token
+ // - candidates: vector of candidate tokens
+ //
+ llama_token llama_sample_token(
+ struct llama_context * ctx,
+ struct llama_context * ctx_guidance,
+ struct llama_grammar * grammar,
+ const struct gpt_params & params,
+ const std::vector<llama_token> & last_tokens,
+ std::vector<llama_token_data> & candidates,
+ int idx = 0);
+
  //
  // YAML utils
  //
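Given the declaration above, a caller drives the helper roughly as follows. This is a minimal sketch rather than code from the commit; the next_token wrapper name and the per-call candidate buffer are assumptions:

    #include "common.h"

    // Sample one token from the last evaluated position, with no guidance
    // context and no grammar; last_tokens feeds the repetition penalties.
    static llama_token next_token(llama_context * ctx, const gpt_params & params,
                                  std::vector<llama_token> & last_tokens) {
        std::vector<llama_token_data> candidates;
        candidates.reserve(llama_n_vocab(llama_get_model(ctx)));

        const llama_token id = llama_sample_token(ctx, /*ctx_guidance*/ nullptr, /*grammar*/ nullptr,
                                                  params, last_tokens, candidates);

        last_tokens.push_back(id); // keep a history so the repeat penalty has something to look at
        return id;
    }

Passing a non-NULL ctx_guidance enables the classifier-free guidance path controlled by the new cfg_scale and cfg_negative_prompt fields of gpt_params.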
convert-refact-hf-to-gguf.py CHANGED
@@ -17,6 +17,33 @@ if "NO_LOCAL_GGUF" not in os.environ:
  sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
  import gguf

+
+ def bytes_to_unicode():
+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+ """
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
+ The reversible bpe codes work on unicode strings.
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+ This is a significant percentage of your normal, say, 32K bpe vocab.
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
+ """
+ bs = (
+ list(range(ord("!"), ord("~") + 1))
+ + list(range(ord("¡"), ord("¬") + 1))
+ + list(range(ord("®"), ord("ÿ") + 1))
+ )
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8 + n)
+ n += 1
+ return dict(zip(bs, (chr(n) for n in cs)))
+
+
  def count_model_parts(dir_model: Path) -> int:
  num_parts = 0
  for filename in os.listdir(dir_model):
@@ -126,25 +153,53 @@ tokens: list[bytearray] = []
  scores: list[float] = []
  toktypes: list[int] = []

+ tokenizer_json_file = dir_model / "tokenizer.json"
+ if not tokenizer_json_file.is_file():
+ print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr)
+ sys.exit(1)
+
  # gpt2 tokenizer
  gguf_writer.add_tokenizer_model("gpt2")

- print("gguf: get gpt2 tokenizer vocab")
+ with open(tokenizer_json_file, "r", encoding="utf-8") as f:
+ tokenizer_json = json.load(f)

- # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
- tokenizer = AutoTokenizer.from_pretrained(dir_model)
+ print("gguf: get gpt2 tokenizer vocab")

  # The number of tokens in tokenizer.json can differ from the expected vocab size.
  # This causes downstream issues with mismatched tensor sizes when running the inference
- vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
- assert max(tokenizer.vocab.values()) < vocab_size
+ vocab_size = (
+ hparams["vocab_size"]
+ if "vocab_size" in hparams
+ else len(tokenizer_json["model"]["vocab"])
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)

  reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+ byte_encoder = bytes_to_unicode()
+ byte_decoder = {v: k for k, v in byte_encoder.items()}

  for i in range(vocab_size):
- tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
- scores.append(0.0) # dummy
- toktypes.append(gguf.TokenType.NORMAL)
+ if i in reverse_vocab:
+ text = reverse_vocab[i]
+ try:
+ text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+ except KeyError:
+ text = bytearray()
+ for c in reverse_vocab[i]:
+ if ord(c) < 256: # single byte character
+ text.append(byte_decoder[ord(c)])
+ else: # multibyte special token character
+ text.extend(c.encode("utf-8"))
+ else:
+ print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+ pad_token = f"[PAD{i}]".encode("utf8")
+ text = bytearray(pad_token)
+
+ tokens.append(text)
+ scores.append(0.0) # dymmy
+ toktypes.append(gguf.TokenType.NORMAL) # dummy

  gguf_writer.add_token_list(tokens)
  gguf_writer.add_token_scores(scores)
examples/CMakeLists.txt CHANGED
@@ -25,7 +25,6 @@ else()
  add_subdirectory(convert-llama2c-to-ggml)
  add_subdirectory(simple)
  add_subdirectory(batched)
- add_subdirectory(batched-bench)
  add_subdirectory(speculative)
  add_subdirectory(parallel)
  add_subdirectory(embd-input)
examples/batched/batched.cpp CHANGED
@@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
  ctx_params.seed = 1234;
  ctx_params.n_ctx = n_kv_req;
  ctx_params.n_batch = std::max(n_len, n_parallel);
- ctx_params.n_threads = params.n_threads;
+ ctx_params.n_threads = params.n_threads;
  ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

  llama_context * ctx = llama_new_context_with_model(model, ctx_params);
examples/embd-input/embd-input-lib.cpp CHANGED
@@ -128,22 +128,21 @@ bool eval_string(struct MyModel * mymodel,const char* str){
128
  llama_token sampling_id(struct MyModel* mymodel) {
129
  llama_context* ctx = mymodel->ctx;
130
  gpt_params params = mymodel->params;
131
- llama_sampling_params & sparams = params.sampling_params;
132
  // int n_ctx = llama_n_ctx(ctx);
133
 
134
  // out of user input, sample next token
135
- const float temp = sparams.temp;
136
- const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
137
- const float top_p = sparams.top_p;
138
- const float tfs_z = sparams.tfs_z;
139
- const float typical_p = sparams.typical_p;
140
  // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
141
  // const float repeat_penalty = params.repeat_penalty;
142
  // const float alpha_presence = params.presence_penalty;
143
  // const float alpha_frequency = params.frequency_penalty;
144
- const int mirostat = sparams.mirostat;
145
- const float mirostat_tau = sparams.mirostat_tau;
146
- const float mirostat_eta = sparams.mirostat_eta;
147
  // const bool penalize_nl = params.penalize_nl;
148
 
149
  llama_token id = 0;
@@ -152,7 +151,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
152
  auto n_vocab = llama_n_vocab(llama_get_model(ctx));
153
 
154
  // Apply params.logit_bias map
155
- for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
156
  logits[it->first] += it->second;
157
  }
158
 
 
128
  llama_token sampling_id(struct MyModel* mymodel) {
129
  llama_context* ctx = mymodel->ctx;
130
  gpt_params params = mymodel->params;
 
131
  // int n_ctx = llama_n_ctx(ctx);
132
 
133
  // out of user input, sample next token
134
+ const float temp = params.temp;
135
+ const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
136
+ const float top_p = params.top_p;
137
+ const float tfs_z = params.tfs_z;
138
+ const float typical_p = params.typical_p;
139
  // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
140
  // const float repeat_penalty = params.repeat_penalty;
141
  // const float alpha_presence = params.presence_penalty;
142
  // const float alpha_frequency = params.frequency_penalty;
143
+ const int mirostat = params.mirostat;
144
+ const float mirostat_tau = params.mirostat_tau;
145
+ const float mirostat_eta = params.mirostat_eta;
146
  // const bool penalize_nl = params.penalize_nl;
147
 
148
  llama_token id = 0;
 
151
  auto n_vocab = llama_n_vocab(llama_get_model(ctx));
152
 
153
  // Apply params.logit_bias map
154
+ for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
155
  logits[it->first] += it->second;
156
  }
157
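
One detail of the block above that is easy to miss: after the sampling options are read straight off gpt_params, the logit_bias map is added to the raw logits before any sampler runs. A short sketch of that step, assuming the usual llama_get_logits accessor for the logits pointer:

// Apply the (token id -> bias) map to the raw logits before sampling.
// A bias of -INFINITY effectively bans a token; the server changes further
// below use exactly this to implement ignore_eos.
float * logits = llama_get_logits(ctx);
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
    logits[it->first] += it->second;
}
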
 
examples/infill/infill.cpp CHANGED
@@ -104,7 +104,6 @@ static void sigint_handler(int signo) {
104
 
105
  int main(int argc, char ** argv) {
106
  gpt_params params;
107
- llama_sampling_params & sparams = params.sampling_params;
108
  g_params = &params;
109
 
110
  if (!gpt_params_parse(argc, argv, params)) {
@@ -207,7 +206,7 @@ int main(int argc, char ** argv) {
207
  // load the model and apply lora adapter, if any
208
  LOG("%s: load the model and apply lora adapter, if any\n", __func__);
209
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
210
- if (sparams.cfg_scale > 1.f) {
211
  struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
212
  ctx_guidance = llama_new_context_with_model(model, lparams);
213
  }
@@ -234,22 +233,10 @@ int main(int argc, char ** argv) {
234
  const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
235
  LOG("add_bos: %d\n", add_bos);
236
 
237
- bool suff_rm_leading_spc = params.escape;
238
- if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
239
- params.input_suffix.erase(0, 1);
240
- suff_rm_leading_spc = false;
241
- }
242
  std::vector<llama_token> embd_inp;
243
- std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
244
- std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
245
- const int space_token = 29871;
246
- if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
247
- inp_sfx.erase(inp_sfx.begin());
248
- }
249
  inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
250
- if (add_bos) {
251
- inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
252
- }
253
  inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
254
  embd_inp = inp_pfx;
255
  embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
@@ -270,9 +257,9 @@ int main(int argc, char ** argv) {
270
  int guidance_offset = 0;
271
  int original_prompt_len = 0;
272
  if (ctx_guidance) {
273
- LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
274
 
275
- guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
276
  LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
277
 
278
  std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@@ -313,7 +300,7 @@ int main(int argc, char ** argv) {
313
 
314
  if (ctx_guidance) {
315
  LOG_TEE("\n");
316
- LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
317
  LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
318
  for (int i = 0; i < (int) guidance_inp.size(); i++) {
319
  LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@@ -359,7 +346,7 @@ int main(int argc, char ** argv) {
359
  }
360
  }
361
  LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
362
- sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
363
  LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
364
  LOG_TEE("\n\n");
365
 
@@ -377,8 +364,8 @@ int main(int argc, char ** argv) {
377
  LOG_TEE("\n");
378
 
379
  {
380
- auto it = sparams.logit_bias.find(llama_token_eos(ctx));
381
- if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
382
  LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
383
  }
384
  }
@@ -435,7 +422,6 @@ int main(int argc, char ** argv) {
435
 
436
  const int n_vocab = llama_n_vocab(model);
437
 
438
- llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
439
  std::vector<llama_token_data> candidates;
440
  candidates.reserve(n_vocab);
441
 
@@ -554,7 +540,7 @@ int main(int argc, char ** argv) {
554
 
555
  if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
556
 
557
- const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
558
 
559
  last_tokens.erase(last_tokens.begin());
560
  last_tokens.push_back(id);
@@ -641,27 +627,10 @@ int main(int argc, char ** argv) {
641
  buffer.clear();
642
  // done taking input, reset color
643
  console::set_display(console::reset);
644
-
645
- if (params.escape) {
646
- //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
647
- process_escapes(params.input_prefix);
648
- process_escapes(params.input_suffix);
649
- }
650
- suff_rm_leading_spc = params.escape;
651
- if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
652
- params.input_suffix.erase(0, 1);
653
- suff_rm_leading_spc = false;
654
- }
655
  // tokenize new prefix and suffix
656
- std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
657
- std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
658
- if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
659
- inp_sfx.erase(inp_sfx.begin());
660
- }
661
  inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
662
- if (add_bos) {
663
- inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
664
- }
665
  inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
666
  embd_inp = inp_pfx;
667
  embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
 
104
 
105
  int main(int argc, char ** argv) {
106
  gpt_params params;
 
107
  g_params = &params;
108
 
109
  if (!gpt_params_parse(argc, argv, params)) {
 
206
  // load the model and apply lora adapter, if any
207
  LOG("%s: load the model and apply lora adapter, if any\n", __func__);
208
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
209
+ if (params.cfg_scale > 1.f) {
210
  struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
211
  ctx_guidance = llama_new_context_with_model(model, lparams);
212
  }
 
233
  const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
234
  LOG("add_bos: %d\n", add_bos);
235
 
 
 
 
 
 
236
  std::vector<llama_token> embd_inp;
237
+ std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
238
+ std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
 
 
 
 
239
  inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
 
 
 
240
  inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
241
  embd_inp = inp_pfx;
242
  embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
 
257
  int guidance_offset = 0;
258
  int original_prompt_len = 0;
259
  if (ctx_guidance) {
260
+ LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
261
 
262
+ guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
263
  LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
264
 
265
  std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
 
300
 
301
  if (ctx_guidance) {
302
  LOG_TEE("\n");
303
+ LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
304
  LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
305
  for (int i = 0; i < (int) guidance_inp.size(); i++) {
306
  LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
 
346
  }
347
  }
348
  LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
349
+ params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
350
  LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
351
  LOG_TEE("\n\n");
352
 
 
364
  LOG_TEE("\n");
365
 
366
  {
367
+ auto it = params.logit_bias.find(llama_token_eos(ctx));
368
+ if (it != params.logit_bias.end() && it->second == -INFINITY) {
369
  LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
370
  }
371
  }
 
422
 
423
  const int n_vocab = llama_n_vocab(model);
424
 
 
425
  std::vector<llama_token_data> candidates;
426
  candidates.reserve(n_vocab);
427
 
 
540
 
541
  if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
542
 
543
+ const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
544
 
545
  last_tokens.erase(last_tokens.begin());
546
  last_tokens.push_back(id);
 
627
  buffer.clear();
628
  // done taking input, reset color
629
  console::set_display(console::reset);
 
 
 
 
 
 
 
 
 
 
 
630
  // tokenize new prefix and suffix
631
+ std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
632
+ std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
 
 
 
633
  inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
 
 
 
634
  inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
635
  embd_inp = inp_pfx;
636
  embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
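
With the leading-space and BOS special-casing gone, the infill prompt reduces to: tokenize prefix and suffix (BOS handled by the tokenizer call), wrap them in the fill-in-the-middle marker tokens, and concatenate. A compact sketch using only helpers that appear in this hunk; the middle marker, llama_token_middle, shows up in the server's loadInfill further below.

// Resulting layout: [FIM_PRE] <prefix tokens> [FIM_SUF] <suffix tokens>
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);

inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); // FIM prefix marker
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); // FIM suffix marker

std::vector<llama_token> embd_inp = inp_pfx;
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
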
examples/main/main.cpp CHANGED
@@ -109,7 +109,6 @@ int main(int argc, char ** argv) {
109
  if (!gpt_params_parse(argc, argv, params)) {
110
  return 1;
111
  }
112
- llama_sampling_params & sparams = params.sampling_params;
113
 
114
  #ifndef LOG_DISABLE_LOGS
115
  log_set_target(log_filename_generator("main", "log"));
@@ -180,7 +179,7 @@ int main(int argc, char ** argv) {
180
  // load the model and apply lora adapter, if any
181
  LOG("%s: load the model and apply lora adapter, if any\n", __func__);
182
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
183
- if (sparams.cfg_scale > 1.f) {
184
  struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
185
  ctx_guidance = llama_new_context_with_model(model, lparams);
186
  }
@@ -258,9 +257,9 @@ int main(int argc, char ** argv) {
258
  int guidance_offset = 0;
259
  int original_prompt_len = 0;
260
  if (ctx_guidance) {
261
- LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
262
 
263
- guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
264
  LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
265
 
266
  std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@@ -297,9 +296,6 @@ int main(int argc, char ** argv) {
297
  LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
298
  __func__, n_matching_session_tokens, embd_inp.size());
299
  }
300
-
301
- // remove any "future" tokens that we might have inherited from the previous session
302
- llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
303
  }
304
 
305
  LOGLN(
@@ -347,7 +343,7 @@ int main(int argc, char ** argv) {
347
 
348
  if (ctx_guidance) {
349
  LOG_TEE("\n");
350
- LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
351
  LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
352
  for (int i = 0; i < (int) guidance_inp.size(); i++) {
353
  LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@@ -399,7 +395,7 @@ int main(int argc, char ** argv) {
399
  }
400
  }
401
  LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
402
- sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
403
  LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
404
  LOG_TEE("\n\n");
405
 
@@ -417,8 +413,8 @@ int main(int argc, char ** argv) {
417
  LOG_TEE("\n");
418
 
419
  {
420
- auto it = sparams.logit_bias.find(llama_token_eos(ctx));
421
- if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
422
  LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
423
  }
424
  }
@@ -473,7 +469,6 @@ int main(int argc, char ** argv) {
473
 
474
  const int n_vocab = llama_n_vocab(model);
475
 
476
- llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
477
  std::vector<llama_token_data> candidates;
478
  candidates.reserve(n_vocab);
479
 
@@ -548,6 +543,9 @@ int main(int argc, char ** argv) {
548
  if (i > 0) {
549
  embd.erase(embd.begin(), embd.begin() + i);
550
  }
 
 
 
551
  }
552
 
553
  // evaluate tokens in batches
@@ -627,7 +625,7 @@ int main(int argc, char ** argv) {
627
  LOG("saved session to %s\n", path_session.c_str());
628
  }
629
 
630
- const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
631
 
632
  last_tokens.erase(last_tokens.begin());
633
  last_tokens.push_back(id);
 
109
  if (!gpt_params_parse(argc, argv, params)) {
110
  return 1;
111
  }
 
112
 
113
  #ifndef LOG_DISABLE_LOGS
114
  log_set_target(log_filename_generator("main", "log"));
 
179
  // load the model and apply lora adapter, if any
180
  LOG("%s: load the model and apply lora adapter, if any\n", __func__);
181
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
182
+ if (params.cfg_scale > 1.f) {
183
  struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
184
  ctx_guidance = llama_new_context_with_model(model, lparams);
185
  }
 
257
  int guidance_offset = 0;
258
  int original_prompt_len = 0;
259
  if (ctx_guidance) {
260
+ LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
261
 
262
+ guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
263
  LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
264
 
265
  std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
 
296
  LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
297
  __func__, n_matching_session_tokens, embd_inp.size());
298
  }
 
 
 
299
  }
300
 
301
  LOGLN(
 
343
 
344
  if (ctx_guidance) {
345
  LOG_TEE("\n");
346
+ LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
347
  LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
348
  for (int i = 0; i < (int) guidance_inp.size(); i++) {
349
  LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
 
395
  }
396
  }
397
  LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
398
+ params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
399
  LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
400
  LOG_TEE("\n\n");
401
 
 
413
  LOG_TEE("\n");
414
 
415
  {
416
+ auto it = params.logit_bias.find(llama_token_eos(ctx));
417
+ if (it != params.logit_bias.end() && it->second == -INFINITY) {
418
  LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
419
  }
420
  }
 
469
 
470
  const int n_vocab = llama_n_vocab(model);
471
 
 
472
  std::vector<llama_token_data> candidates;
473
  candidates.reserve(n_vocab);
474
 
 
543
  if (i > 0) {
544
  embd.erase(embd.begin(), embd.begin() + i);
545
  }
546
+
547
+ // remove any "future" tokens that we might have inherited from the session from the KV cache
548
+ llama_kv_cache_tokens_rm(ctx, n_past, -1);
549
  }
550
 
551
  // evaluate tokens in batches
 
625
  LOG("saved session to %s\n", path_session.c_str());
626
  }
627
 
628
+ const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
629
 
630
  last_tokens.erase(last_tokens.begin());
631
  last_tokens.push_back(id);
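
Two behavioural points in the main-example hunk: sampling goes back to the plain llama_sample_token(ctx, ctx_guidance, grammar, params, ...) call, and the KV-cache clean-up for reused sessions now runs right after embd is trimmed, keyed on n_past. The clean-up is sketched below; the call semantics are as used in the hunk (second argument is the first cache position to drop, -1 means "to the end"), treat the exact signature as an assumption of this API vintage.

if (i > 0) {
    embd.erase(embd.begin(), embd.begin() + i);
}

// Drop cache entries [n_past, end) inherited from the saved session, so any
// "future" tokens are re-evaluated instead of being trusted as-is.
llama_kv_cache_tokens_rm(ctx, n_past, -1);
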
examples/parallel/parallel.cpp CHANGED
@@ -125,8 +125,6 @@ int main(int argc, char ** argv) {
125
  params.logits_all = true;
126
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
127
 
128
- llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
129
-
130
  // load the prompts from an external file if there are any
131
  if (params.prompt.empty()) {
132
  printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
@@ -169,7 +167,7 @@ int main(int argc, char ** argv) {
169
 
170
  // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
171
  // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
172
- llama_batch batch = llama_batch_init(n_ctx, 0);
173
 
174
  int32_t n_total_prompt = 0;
175
  int32_t n_total_gen = 0;
@@ -341,7 +339,7 @@ int main(int argc, char ** argv) {
341
  //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
342
  // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
343
 
344
- const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);
345
 
346
  if (client.n_decoded == 1) {
347
  // start measuring generation time after the first token to make sure all concurrent clients
@@ -386,7 +384,7 @@ int main(int argc, char ** argv) {
386
 
387
  n_total_prompt += client.n_prompt;
388
  n_total_gen += client.n_decoded;
389
- llama_sampling_context_reset(ctx_sampling, client.seq_id);
390
  client.seq_id = -1;
391
  }
392
 
 
125
  params.logits_all = true;
126
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
127
 
 
 
128
  // load the prompts from an external file if there are any
129
  if (params.prompt.empty()) {
130
  printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
 
167
 
168
  // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
169
  // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
170
+ llama_batch batch = llama_batch_init(params.n_ctx, 0);
171
 
172
  int32_t n_total_prompt = 0;
173
  int32_t n_total_gen = 0;
 
339
  //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
340
  // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
341
 
342
+ const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
343
 
344
  if (client.n_decoded == 1) {
345
  // start measuring generation time after the first token to make sure all concurrent clients
 
384
 
385
  n_total_prompt += client.n_prompt;
386
  n_total_gen += client.n_decoded;
387
+
388
  client.seq_id = -1;
389
  }
390
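
The batch in the parallel example is allocated for a full context worth of tokens, per the comment in the hunk: several clients can queue long prompts at once, while the decode loop still submits at most params.n_batch tokens per call. A one-line sketch with the signature as used above (the second argument selects token-id rather than embedding input in this API vintage; treat that as an assumption):

// One shared batch sized to the whole context; 0 = token-id input.
llama_batch batch = llama_batch_init(params.n_ctx, 0);
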
 
examples/save-load-state/save-load-state.cpp CHANGED
@@ -8,10 +8,9 @@
8
 
9
  int main(int argc, char ** argv) {
10
  gpt_params params;
11
- llama_sampling_params & sparams = params.sampling_params;
12
  params.seed = 42;
13
  params.n_threads = 4;
14
- sparams.repeat_last_n = 64;
15
  params.prompt = "The quick brown fox";
16
 
17
  if (!gpt_params_parse(argc, argv, params)) {
@@ -25,7 +24,7 @@ int main(int argc, char ** argv) {
25
  }
26
 
27
  auto n_past = 0;
28
- auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
29
 
30
  // init
31
  llama_model * model;
 
8
 
9
  int main(int argc, char ** argv) {
10
  gpt_params params;
 
11
  params.seed = 42;
12
  params.n_threads = 4;
13
+ params.repeat_last_n = 64;
14
  params.prompt = "The quick brown fox";
15
 
16
  if (!gpt_params_parse(argc, argv, params)) {
 
24
  }
25
 
26
  auto n_past = 0;
27
+ auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
28
 
29
  // init
30
  llama_model * model;
examples/server/index.html.hpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/server/public/index.html CHANGED
@@ -136,11 +136,6 @@
136
  display: block;
137
  }
138
 
139
- fieldset label.slim {
140
- margin: 0 0.5em;
141
- display: inline;
142
- }
143
-
144
  header, footer {
145
  text-align: center;
146
  }
@@ -150,14 +145,6 @@
150
  color: #888;
151
  }
152
 
153
- .mode-chat textarea[name=prompt] {
154
- height: 4.5em;
155
- }
156
-
157
- .mode-completion textarea[name=prompt] {
158
- height: 10em;
159
- }
160
-
161
 
162
  @keyframes loading-bg-wipe {
163
  0% {
@@ -200,7 +187,7 @@
200
  template: "{{prompt}}\n\n{{history}}\n{{char}}:",
201
  historyTemplate: "{{name}}: {{message}}",
202
  transcript: [],
203
- type: "chat", // "chat" | "completion"
204
  char: "Llama",
205
  user: "User",
206
  })
@@ -378,44 +365,13 @@
378
  return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
379
  }
380
 
381
- async function runLlama(prompt, llamaParams, char) {
382
- const currentMessages = [];
383
- const history = session.value.transcript;
384
- if (controller.value) {
385
- throw new Error("already running");
386
- }
387
- controller.value = new AbortController();
388
- for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
389
- const data = chunk.data;
390
-
391
- if (data.stop) {
392
- while (
393
- currentMessages.length > 0 &&
394
- currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
395
- ) {
396
- currentMessages.pop();
397
- }
398
- transcriptUpdate([...history, [char, currentMessages]])
399
- console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
400
- } else {
401
- currentMessages.push(data);
402
- transcriptUpdate([...history, [char, currentMessages]])
403
- }
404
-
405
- if (data.timings) {
406
- llamaStats.value = data.timings;
407
- }
408
- }
409
-
410
- controller.value = null;
411
- }
412
-
413
  // send message to server
414
  const chat = async (msg) => {
415
  if (controller.value) {
416
  console.log('already running...');
417
  return;
418
  }
 
419
 
420
  transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
421
 
@@ -435,41 +391,55 @@
435
  ).join("\n"),
436
  });
437
 
438
- await runLlama(prompt, {
 
 
 
439
  ...params.value,
440
  stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
441
- }, "{{char}}");
442
- }
443
-
444
- const runCompletion = async () => {
445
- if (controller.value) {
446
- console.log('already running...');
447
- return;
448
  }
449
- const {prompt} = session.value;
450
- transcriptUpdate([...session.value.transcript, ["", prompt]]);
451
- await runLlama(prompt, {
452
- ...params.value,
453
- stop: [],
454
- }, "");
455
- }
456
 
457
- const stop = (e) => {
458
- e.preventDefault();
459
- if (controller.value) {
460
- controller.value.abort();
461
- controller.value = null;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
  }
463
- }
464
 
465
- const reset = (e) => {
466
- stop(e);
467
- transcriptUpdate([]);
468
  }
469
 
470
  function MessageInput() {
471
  const message = useSignal("")
472
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  const submit = (e) => {
474
  stop(e);
475
  chat(message.value);
@@ -504,19 +474,6 @@
504
  `
505
  }
506
 
507
- function CompletionControls() {
508
- const submit = (e) => {
509
- stop(e);
510
- runCompletion();
511
- }
512
- return html`
513
- <div>
514
- <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
515
- <button onclick=${stop} disabled=${!generating.value}>Stop</button>
516
- <button onclick=${reset}>Reset</button>
517
- </div>`;
518
- }
519
-
520
  const ChatLog = (props) => {
521
  const messages = session.value.transcript;
522
  const container = useRef(null)
@@ -540,11 +497,7 @@
540
  data;
541
  message = html`<${Markdownish} text=${template(text)} />`
542
  }
543
- if(user) {
544
- return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
545
- } else {
546
- return html`<p key=${index}>${message}</p>`
547
- }
548
  };
549
 
550
  return html`
@@ -621,31 +574,18 @@
621
  userTemplateAutosave()
622
  }, [session.value, params.value])
623
 
624
- const GrammarControl = () => (
625
- html`
626
- <div>
627
- <label for="template">Grammar</label>
628
- <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
629
- <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
630
- <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
631
- </div>
632
- `
633
- );
634
-
635
- const PromptControlFieldSet = () => (
636
- html`
637
- <fieldset>
638
- <div>
639
- <label htmlFor="prompt">Prompt</label>
640
- <textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
641
- </div>
642
- </fieldset>
643
- `
644
- );
645
 
646
- const ChatConfigForm = () => (
647
- html`
648
- ${PromptControlFieldSet()}
 
 
 
649
 
650
  <fieldset class="two">
651
  <div>
@@ -669,30 +609,15 @@
669
  <label for="template">Chat history template</label>
670
  <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
671
  </div>
672
- ${GrammarControl()}
673
- </fieldset>
674
- `
675
- );
676
-
677
- const CompletionConfigForm = () => (
678
- html`
679
- ${PromptControlFieldSet()}
680
- <fieldset>${GrammarControl()}</fieldset>
681
- `
682
- );
683
 
684
- return html`
685
- <form>
686
- <fieldset class="two">
687
- <${UserTemplateResetButton}/>
688
  <div>
689
- <label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
690
- <label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
 
 
691
  </div>
692
  </fieldset>
693
 
694
- ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
695
-
696
  <fieldset class="two">
697
  ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
698
  ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
@@ -926,7 +851,7 @@
926
  function App(props) {
927
 
928
  return html`
929
- <div class="mode-${session.value.type}">
930
  <header>
931
  <h1>llama.cpp</h1>
932
  </header>
@@ -936,7 +861,7 @@
936
  </main>
937
 
938
  <section id="write">
939
- <${session.value.type === 'chat' ? MessageInput : CompletionControls} />
940
  </section>
941
 
942
  <footer>
 
136
  display: block;
137
  }
138
 
 
 
 
 
 
139
  header, footer {
140
  text-align: center;
141
  }
 
145
  color: #888;
146
  }
147
 
 
 
 
 
 
 
 
 
148
 
149
  @keyframes loading-bg-wipe {
150
  0% {
 
187
  template: "{{prompt}}\n\n{{history}}\n{{char}}:",
188
  historyTemplate: "{{name}}: {{message}}",
189
  transcript: [],
190
+ type: "chat",
191
  char: "Llama",
192
  user: "User",
193
  })
 
365
  return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
366
  }
367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  // send message to server
369
  const chat = async (msg) => {
370
  if (controller.value) {
371
  console.log('already running...');
372
  return;
373
  }
374
+ controller.value = new AbortController();
375
 
376
  transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
377
 
 
391
  ).join("\n"),
392
  });
393
 
394
+ const currentMessages = [];
395
+ const history = session.value.transcript
396
+
397
+ const llamaParams = {
398
  ...params.value,
399
  stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
 
 
 
 
 
 
 
400
  }
 
 
 
 
 
 
 
401
 
402
+ for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
403
+ const data = chunk.data;
404
+
405
+ if (data.stop) {
406
+ while (
407
+ currentMessages.length > 0 &&
408
+ currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
409
+ ) {
410
+ currentMessages.pop();
411
+ }
412
+ transcriptUpdate([...history, ["{{char}}", currentMessages]])
413
+ console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
414
+ } else {
415
+ currentMessages.push(data);
416
+ transcriptUpdate([...history, ["{{char}}", currentMessages]])
417
+ }
418
+
419
+ if (data.timings) {
420
+ llamaStats.value = data.timings;
421
+ }
422
  }
 
423
 
424
+ controller.value = null;
 
 
425
  }
426
 
427
  function MessageInput() {
428
  const message = useSignal("")
429
 
430
+ const stop = (e) => {
431
+ e.preventDefault();
432
+ if (controller.value) {
433
+ controller.value.abort();
434
+ controller.value = null;
435
+ }
436
+ }
437
+
438
+ const reset = (e) => {
439
+ stop(e);
440
+ transcriptUpdate([]);
441
+ }
442
+
443
  const submit = (e) => {
444
  stop(e);
445
  chat(message.value);
 
474
  `
475
  }
476
 
 
 
 
 
 
 
 
 
 
 
 
 
 
477
  const ChatLog = (props) => {
478
  const messages = session.value.transcript;
479
  const container = useRef(null)
 
497
  data;
498
  message = html`<${Markdownish} text=${template(text)} />`
499
  }
500
+ return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
 
 
 
 
501
  };
502
 
503
  return html`
 
574
  userTemplateAutosave()
575
  }, [session.value, params.value])
576
 
577
+ return html`
578
+ <form>
579
+ <fieldset>
580
+ <${UserTemplateResetButton}/>
581
+ </fieldset>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
582
 
583
+ <fieldset>
584
+ <div>
585
+ <label for="prompt">Prompt</label>
586
+ <textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
587
+ </div>
588
+ </fieldset>
589
 
590
  <fieldset class="two">
591
  <div>
 
609
  <label for="template">Chat history template</label>
610
  <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
611
  </div>
 
 
 
 
 
 
 
 
 
 
 
612
 
 
 
 
 
613
  <div>
614
+ <label for="template">Grammar</label>
615
+ <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
616
+ <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
617
+ <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
618
  </div>
619
  </fieldset>
620
 
 
 
621
  <fieldset class="two">
622
  ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
623
  ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
 
851
  function App(props) {
852
 
853
  return html`
854
+ <div>
855
  <header>
856
  <h1>llama.cpp</h1>
857
  </header>
 
861
  </main>
862
 
863
  <section id="write">
864
+ <${MessageInput} />
865
  </section>
866
 
867
  <footer>
examples/server/server.cpp CHANGED
@@ -200,7 +200,6 @@ struct llama_server_context
200
  llama_model *model = nullptr;
201
  llama_context *ctx = nullptr;
202
  gpt_params params;
203
- llama_sampling_context ctx_sampling;
204
  int n_ctx;
205
 
206
  grammar_parser::parse_state parsed_grammar;
@@ -255,7 +254,6 @@ struct llama_server_context
255
  if (grammar != nullptr) {
256
  llama_grammar_free(grammar);
257
  grammar = nullptr;
258
- ctx_sampling = llama_sampling_context_init(params, NULL);
259
  }
260
  }
261
 
@@ -331,8 +329,8 @@ struct llama_server_context
331
  grammar_parser::print_grammar(stderr, parsed_grammar);
332
 
333
  {
334
- auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx));
335
- if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) {
336
  LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
337
  }
338
  }
@@ -341,26 +339,14 @@ struct llama_server_context
341
  grammar = llama_grammar_init(
342
  grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
343
  }
344
- ctx_sampling = llama_sampling_context_init(params, grammar);
345
  return true;
346
  }
347
 
348
  void loadInfill()
349
  {
350
- bool suff_rm_leading_spc = true;
351
- if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
352
- params.input_suffix.erase(0, 1);
353
- suff_rm_leading_spc = false;
354
- }
355
-
356
- auto prefix_tokens = tokenize(params.input_prefix, false);
357
- auto suffix_tokens = tokenize(params.input_suffix, false);
358
- const int space_token = 29871;
359
- if (suff_rm_leading_spc && suffix_tokens[0] == space_token) {
360
- suffix_tokens.erase(suffix_tokens.begin());
361
- }
362
  prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
363
- prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
364
  prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
365
  prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
366
  prefix_tokens.push_back(llama_token_middle(ctx));
@@ -405,7 +391,6 @@ struct llama_server_context
405
  // compare the evaluated prompt with the new prompt
406
  n_past = common_part(embd, prompt_tokens);
407
  embd = prompt_tokens;
408
-
409
  if (n_past == num_prompt_tokens)
410
  {
411
  // we have to evaluate at least 1 token to generate logits.
@@ -413,9 +398,6 @@ struct llama_server_context
413
  n_past--;
414
  }
415
 
416
- // since #3228 we now have to manually manage the KV cache
417
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
418
-
419
  LOG_VERBOSE("prompt ingested", {
420
  {"n_past", n_past},
421
  {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -465,6 +447,9 @@ struct llama_server_context
465
  // compare the evaluated prompt with the new prompt
466
  n_past = common_part(embd, prompt_tokens);
467
 
 
 
 
468
  embd = prompt_tokens;
469
  if (n_past == num_prompt_tokens)
470
  {
@@ -472,9 +457,6 @@ struct llama_server_context
472
  n_past--;
473
  }
474
 
475
- // since #3228 we now have to manually manage the KV cache
476
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
477
-
478
  LOG_VERBOSE("prompt ingested", {
479
  {"n_past", n_past},
480
  {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -557,12 +539,12 @@ struct llama_server_context
557
  std::vector<llama_token_data> candidates;
558
  candidates.reserve(llama_n_vocab(model));
559
 
560
- result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates);
561
 
562
  llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
563
 
564
- const int32_t n_probs = params.sampling_params.n_probs;
565
- if (params.sampling_params.temp <= 0 && n_probs > 0)
566
  {
567
  // For llama_sample_token_greedy we need to sort candidates
568
  llama_sample_softmax(ctx, &candidates_p);
@@ -637,7 +619,7 @@ struct llama_server_context
637
  const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
638
  generated_text += token_text;
639
 
640
- if (params.sampling_params.n_probs > 0)
641
  {
642
  generated_token_probs.push_back(token_with_probs);
643
  }
@@ -718,16 +700,15 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
718
  printf("usage: %s [options]\n", argv0);
719
  printf("\n");
720
  printf("options:\n");
721
- printf(" -h, --help show this help message and exit\n");
722
- printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
723
- printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
724
- printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
725
- printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
726
- printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
727
- printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
728
- printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
729
- printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
730
- printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
731
  if (llama_mlock_supported())
732
  {
733
  printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
@@ -872,15 +853,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
872
  }
873
  params.n_threads = std::stoi(argv[i]);
874
  }
875
- else if (arg == "--threads-batch" || arg == "-tb")
876
- {
877
- if (++i >= argc)
878
- {
879
- invalid_param = true;
880
- break;
881
- }
882
- params.n_threads_batch = std::stoi(argv[i]);
883
- }
884
  else if (arg == "-b" || arg == "--batch-size")
885
  {
886
  if (++i >= argc)
@@ -1035,35 +1007,34 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
1035
 
1036
  static json format_generation_settings(llama_server_context &llama)
1037
  {
1038
- const auto & sparams = llama.params.sampling_params;
1039
- const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
1040
- const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
1041
  eos_bias->second < 0.0f && std::isinf(eos_bias->second);
1042
 
1043
  return json{
1044
  {"n_ctx", llama.n_ctx},
1045
  {"model", llama.params.model_alias},
1046
  {"seed", llama.params.seed},
1047
- {"temp", sparams.temp},
1048
- {"top_k", sparams.top_k},
1049
- {"top_p", sparams.top_p},
1050
- {"tfs_z", sparams.tfs_z},
1051
- {"typical_p", sparams.typical_p},
1052
- {"repeat_last_n", sparams.repeat_last_n},
1053
- {"repeat_penalty", sparams.repeat_penalty},
1054
- {"presence_penalty", sparams.presence_penalty},
1055
- {"frequency_penalty", sparams.frequency_penalty},
1056
- {"mirostat", sparams.mirostat},
1057
- {"mirostat_tau", sparams.mirostat_tau},
1058
- {"mirostat_eta", sparams.mirostat_eta},
1059
- {"penalize_nl", sparams.penalize_nl},
1060
  {"stop", llama.params.antiprompt},
1061
  {"n_predict", llama.params.n_predict},
1062
  {"n_keep", llama.params.n_keep},
1063
  {"ignore_eos", ignore_eos},
1064
  {"stream", llama.stream},
1065
- {"logit_bias", sparams.logit_bias},
1066
- {"n_probs", sparams.n_probs},
1067
  {"grammar", llama.params.grammar},
1068
  };
1069
  }
@@ -1112,7 +1083,7 @@ static json format_final_response(llama_server_context &llama, const std::string
1112
  {"timings", format_timings(llama)},
1113
  };
1114
 
1115
- if (llama.params.sampling_params.n_probs > 0)
1116
  {
1117
  res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
1118
  }
@@ -1128,7 +1099,7 @@ static json format_partial_response(
1128
  {"stop", false},
1129
  };
1130
 
1131
- if (llama.params.sampling_params.n_probs > 0)
1132
  {
1133
  res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
1134
  }
@@ -1160,28 +1131,26 @@ static T json_value(const json &body, const std::string &key, const T &default_v
1160
  static void parse_options_completion(const json &body, llama_server_context &llama)
1161
  {
1162
  gpt_params default_params;
1163
- const auto & default_sparams = default_params.sampling_params;
1164
- auto & sparams = llama.params.sampling_params;
1165
 
1166
  llama.stream = json_value(body, "stream", false);
1167
  llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
1168
- sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
1169
- sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
1170
- sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
1171
- sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p);
1172
- sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n);
1173
- sparams.temp = json_value(body, "temperature", default_sparams.temp);
1174
- sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty);
1175
- sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty);
1176
- sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty);
1177
- sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat);
1178
- sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
1179
- sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
1180
- sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
1181
  llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
1182
  llama.params.seed = json_value(body, "seed", default_params.seed);
1183
  llama.params.grammar = json_value(body, "grammar", default_params.grammar);
1184
- sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);
1185
 
1186
  if (body.count("prompt") != 0)
1187
  {
@@ -1192,10 +1161,10 @@ static void parse_options_completion(const json &body, llama_server_context &lla
1192
  llama.prompt = "";
1193
  }
1194
 
1195
- sparams.logit_bias.clear();
1196
  if (json_value(body, "ignore_eos", false))
1197
  {
1198
- sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
1199
  }
1200
 
1201
  const auto &logit_bias = body.find("logit_bias");
@@ -1211,11 +1180,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla
1211
  {
1212
  if (el[1].is_number())
1213
  {
1214
- sparams.logit_bias[tok] = el[1].get<float>();
1215
  }
1216
  else if (el[1].is_boolean() && !el[1].get<bool>())
1217
  {
1218
- sparams.logit_bias[tok] = -INFINITY;
1219
  }
1220
  }
1221
  }
@@ -1235,8 +1204,6 @@ static void parse_options_completion(const json &body, llama_server_context &lla
1235
  }
1236
  }
1237
 
1238
- llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar);
1239
-
1240
  LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
1241
  }
1242
 
@@ -1445,7 +1412,7 @@ int main(int argc, char **argv)
1445
  }
1446
 
1447
  auto probs = llama.generated_token_probs;
1448
- if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
1449
  const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
1450
  probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
1451
  }
@@ -1497,7 +1464,7 @@ int main(int argc, char **argv)
1497
 
1498
  std::vector<completion_token_output> probs_output = {};
1499
 
1500
- if (llama.params.sampling_params.n_probs > 0) {
1501
  const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
1502
  size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
1503
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
@@ -1618,7 +1585,7 @@ int main(int argc, char **argv)
1618
 
1619
  std::vector<completion_token_output> probs_output = {};
1620
 
1621
- if (llama.params.sampling_params.n_probs > 0) {
1622
  const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
1623
  size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
1624
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
 
200
  llama_model *model = nullptr;
201
  llama_context *ctx = nullptr;
202
  gpt_params params;
 
203
  int n_ctx;
204
 
205
  grammar_parser::parse_state parsed_grammar;
 
254
  if (grammar != nullptr) {
255
  llama_grammar_free(grammar);
256
  grammar = nullptr;
 
257
  }
258
  }
259
 
 
329
  grammar_parser::print_grammar(stderr, parsed_grammar);
330
 
331
  {
332
+ auto it = params.logit_bias.find(llama_token_eos(ctx));
333
+ if (it != params.logit_bias.end() && it->second == -INFINITY) {
334
  LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
335
  }
336
  }
 
339
  grammar = llama_grammar_init(
340
  grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
341
  }
 
342
  return true;
343
  }
344
 
345
  void loadInfill()
346
  {
347
+ auto prefix_tokens = tokenize(params.input_prefix, true); // always add BOS
348
+ auto suffix_tokens = tokenize(params.input_suffix, true); // always add BOS
 
 
 
 
 
 
 
 
 
 
349
  prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
 
350
  prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
351
  prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
352
  prefix_tokens.push_back(llama_token_middle(ctx));
 
391
  // compare the evaluated prompt with the new prompt
392
  n_past = common_part(embd, prompt_tokens);
393
  embd = prompt_tokens;
 
394
  if (n_past == num_prompt_tokens)
395
  {
396
  // we have to evaluate at least 1 token to generate logits.
 
398
  n_past--;
399
  }
400
 
 
 
 
401
  LOG_VERBOSE("prompt ingested", {
402
  {"n_past", n_past},
403
  {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
 
447
  // compare the evaluated prompt with the new prompt
448
  n_past = common_part(embd, prompt_tokens);
449
 
450
+ // since #3228 we now have to manually manage the KV cache
451
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
452
+
453
  embd = prompt_tokens;
454
  if (n_past == num_prompt_tokens)
455
  {
 
457
  n_past--;
458
  }
459
 
 
 
 
460
  LOG_VERBOSE("prompt ingested", {
461
  {"n_past", n_past},
462
  {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
 
539
  std::vector<llama_token_data> candidates;
540
  candidates.reserve(llama_n_vocab(model));
541
 
542
+ result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
543
 
544
  llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
545
 
546
+ const int32_t n_probs = params.n_probs;
547
+ if (params.temp <= 0 && n_probs > 0)
548
  {
549
  // For llama_sample_token_greedy we need to sort candidates
550
  llama_sample_softmax(ctx, &candidates_p);
 
619
  const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
620
  generated_text += token_text;
621
 
622
+ if (params.n_probs > 0)
623
  {
624
  generated_token_probs.push_back(token_with_probs);
625
  }
 
700
  printf("usage: %s [options]\n", argv0);
701
  printf("\n");
702
  printf("options:\n");
703
+ printf(" -h, --help show this help message and exit\n");
704
+ printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
705
+ printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
706
+ printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
707
+ printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
708
+ printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
709
+ printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
710
+ printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
711
+ printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
 
712
  if (llama_mlock_supported())
713
  {
714
  printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
 
853
  }
854
  params.n_threads = std::stoi(argv[i]);
855
  }
 
 
 
 
 
 
 
 
 
856
  else if (arg == "-b" || arg == "--batch-size")
857
  {
858
  if (++i >= argc)
 
1007
 
1008
  static json format_generation_settings(llama_server_context &llama)
1009
  {
1010
+ const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
1011
+ const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
 
1012
  eos_bias->second < 0.0f && std::isinf(eos_bias->second);
1013
 
1014
  return json{
1015
  {"n_ctx", llama.n_ctx},
1016
  {"model", llama.params.model_alias},
1017
  {"seed", llama.params.seed},
1018
+ {"temp", llama.params.temp},
1019
+ {"top_k", llama.params.top_k},
1020
+ {"top_p", llama.params.top_p},
1021
+ {"tfs_z", llama.params.tfs_z},
1022
+ {"typical_p", llama.params.typical_p},
1023
+ {"repeat_last_n", llama.params.repeat_last_n},
1024
+ {"repeat_penalty", llama.params.repeat_penalty},
1025
+ {"presence_penalty", llama.params.presence_penalty},
1026
+ {"frequency_penalty", llama.params.frequency_penalty},
1027
+ {"mirostat", llama.params.mirostat},
1028
+ {"mirostat_tau", llama.params.mirostat_tau},
1029
+ {"mirostat_eta", llama.params.mirostat_eta},
1030
+ {"penalize_nl", llama.params.penalize_nl},
1031
  {"stop", llama.params.antiprompt},
1032
  {"n_predict", llama.params.n_predict},
1033
  {"n_keep", llama.params.n_keep},
1034
  {"ignore_eos", ignore_eos},
1035
  {"stream", llama.stream},
1036
+ {"logit_bias", llama.params.logit_bias},
1037
+ {"n_probs", llama.params.n_probs},
1038
  {"grammar", llama.params.grammar},
1039
  };
1040
  }
 
1083
  {"timings", format_timings(llama)},
1084
  };
1085
 
1086
+ if (llama.params.n_probs > 0)
1087
  {
1088
  res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
1089
  }
 
1099
  {"stop", false},
1100
  };
1101
 
1102
+ if (llama.params.n_probs > 0)
1103
  {
1104
  res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
1105
  }
 
1131
  static void parse_options_completion(const json &body, llama_server_context &llama)
1132
  {
1133
  gpt_params default_params;
 
 
1134
 
1135
  llama.stream = json_value(body, "stream", false);
1136
  llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
1137
+ llama.params.top_k = json_value(body, "top_k", default_params.top_k);
1138
+ llama.params.top_p = json_value(body, "top_p", default_params.top_p);
1139
+ llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
1140
+ llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
1141
+ llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
1142
+ llama.params.temp = json_value(body, "temperature", default_params.temp);
1143
+ llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
1144
+ llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
1145
+ llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
1146
+ llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
1147
+ llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
1148
+ llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
1149
+ llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
1150
  llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
1151
  llama.params.seed = json_value(body, "seed", default_params.seed);
1152
  llama.params.grammar = json_value(body, "grammar", default_params.grammar);
1153
+ llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
1154
 
1155
  if (body.count("prompt") != 0)
1156
  {
 
1161
  llama.prompt = "";
1162
  }
1163
 
1164
+ llama.params.logit_bias.clear();
1165
  if (json_value(body, "ignore_eos", false))
1166
  {
1167
+ llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
1168
  }
1169
 
1170
  const auto &logit_bias = body.find("logit_bias");
 
1180
  {
1181
  if (el[1].is_number())
1182
  {
1183
+ llama.params.logit_bias[tok] = el[1].get<float>();
1184
  }
1185
  else if (el[1].is_boolean() && !el[1].get<bool>())
1186
  {
1187
+ llama.params.logit_bias[tok] = -INFINITY;
1188
  }
1189
  }
1190
  }
 
1204
  }
1205
  }
1206
 
 
 
1207
  LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
1208
  }
1209
 
 
1412
  }
1413
 
1414
  auto probs = llama.generated_token_probs;
1415
+ if (llama.params.n_probs > 0 && llama.stopped_word) {
1416
  const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
1417
  probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
1418
  }
 
1464
 
1465
  std::vector<completion_token_output> probs_output = {};
1466
 
1467
+ if (llama.params.n_probs > 0) {
1468
  const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
1469
  size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
1470
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
 
1585
 
1586
  std::vector<completion_token_output> probs_output = {};
1587
 
1588
+ if (llama.params.n_probs > 0) {
1589
  const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
1590
  size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
1591
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
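
The server-side picture after this hunk: request fields are written directly into llama.params, with a default-constructed gpt_params supplying fallbacks, and ignore_eos is implemented purely as a logit bias. A condensed sketch built only from names that appear in the hunk above:

// Pull sampling options from the JSON body, defaulting to gpt_params values.
gpt_params default_params;
llama.params.temp  = json_value(body, "temperature", default_params.temp);
llama.params.top_k = json_value(body, "top_k",       default_params.top_k);
llama.params.top_p = json_value(body, "top_p",       default_params.top_p);

// "ignore_eos" is just a -INFINITY bias on the EOS token, which is also why
// it triggers the "most grammars will fail" warning above.
llama.params.logit_bias.clear();
if (json_value(body, "ignore_eos", false)) {
    llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
}
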
examples/speculative/speculative.cpp CHANGED
@@ -125,8 +125,6 @@ int main(int argc, char ** argv) {
125
  grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
126
  }
127
 
128
- llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt);
129
-
130
  const auto t_dec_start = ggml_time_us();
131
 
132
  while (true) {
@@ -136,7 +134,7 @@ int main(int argc, char ** argv) {
136
 
137
  while (true) {
138
  // sample from the target model
139
- llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft);
140
 
141
  // remember which tokens were sampled - used for repetition penalties during sampling
142
  last_tokens.erase(last_tokens.begin());
@@ -213,13 +211,7 @@ int main(int argc, char ** argv) {
213
  if (grammar_dft) {
214
  llama_grammar_free(grammar_dft);
215
  }
216
- // Note: Hardcoded to sequence id 0, if this ever supports parallel generation
217
- // that will need to change.
218
- auto it = ctx_sampling.sequence_contexts.find(0);
219
- GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
220
- // This is necessary because each sequence id in sequence_contexts
221
- // uses a copy of the original grammar.
222
- grammar_dft = llama_grammar_copy(it->second.grammar);
223
 
224
  LOG("copied target grammar to draft grammar\n");
225
  }
 
125
  grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
126
  }
127
 
 
 
128
  const auto t_dec_start = ggml_time_us();
129
 
130
  while (true) {
 
134
 
135
  while (true) {
136
  // sample from the target model
137
+ llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
138
 
139
  // remember which tokens were sampled - used for repetition penalties during sampling
140
  last_tokens.erase(last_tokens.begin());
 
211
  if (grammar_dft) {
212
  llama_grammar_free(grammar_dft);
213
  }
214
+ grammar_dft = llama_grammar_copy(grammar_tgt);
 
 
 
 
 
 
215
 
216
  LOG("copied target grammar to draft grammar\n");
217
  }
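
A note on the speculative example above: the change reverts to sampling with llama_sample_token() against the target grammar, and when the draft model has to be re-synced its stale grammar is freed and replaced with a fresh llama_grammar_copy() of the target one. The sketch below shows only that free-then-clone ownership flow, with a stand-in Grammar type instead of llama.cpp's opaque llama_grammar pointers.

    // Illustrative only: the "free the draft copy, re-clone from the target" pattern above.
    #include <string>
    #include <iostream>

    struct Grammar {
        std::string rules;   // placeholder for a parsed rule set
    };

    static Grammar * grammar_copy(const Grammar * src) { return new Grammar(*src); }
    static void      grammar_free(Grammar * g)         { delete g; }

    int main() {
        Grammar * target = new Grammar{"root ::= \"yes\" | \"no\""};
        Grammar * draft  = nullptr;

        // each time the target accepts tokens, the draft grammar must match the
        // target grammar's state again: drop the stale copy and clone a fresh one
        for (int step = 0; step < 3; ++step) {
            if (draft) {
                grammar_free(draft);
            }
            draft = grammar_copy(target);
            std::cout << "step " << step << ": draft grammar re-synced (" << draft->rules << ")\n";
        }

        grammar_free(draft);
        grammar_free(target);
    }

Cloning rather than sharing lets the draft advance its own grammar state ahead of the target without corrupting the target's copy.
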
ggml-alloc.c CHANGED
@@ -1,5 +1,4 @@
1
  #include "ggml-alloc.h"
2
- #include "ggml-backend.h"
3
  #include "ggml.h"
4
  #include <assert.h>
5
  #include <stdarg.h>
@@ -7,6 +6,25 @@
7
  #include <stdlib.h>
8
  #include <string.h>
9
 
10
 
11
  #define UNUSED(x) (void)(x)
12
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -62,9 +80,8 @@ struct free_block {
62
  #define MAX_FREE_BLOCKS 256
63
 
64
  struct ggml_allocr {
65
- struct ggml_backend_buffer * buffer;
66
- bool buffer_owned;
67
  void * data;
 
68
  size_t alignment;
69
  int n_free_blocks;
70
  struct free_block free_blocks[MAX_FREE_BLOCKS];
@@ -102,9 +119,16 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
102
  }
103
  #endif
104
 
 
 
 
 
 
 
105
  // check if a tensor is allocated by this buffer
106
  static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
107
- return tensor->buffer == alloc->buffer;
 
108
  }
109
 
110
  static bool ggml_is_view(struct ggml_tensor * t) {
@@ -112,10 +136,11 @@ static bool ggml_is_view(struct ggml_tensor * t) {
112
  }
113
 
114
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 
115
  GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
116
  GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
117
-
118
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
119
  size = aligned_offset(NULL, size, alloc->alignment);
120
 
121
  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -163,8 +188,6 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
163
 
164
  tensor->data = addr;
165
  AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
166
- tensor->buffer = alloc->buffer;
167
- ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
168
 
169
  #ifdef GGML_ALLOCATOR_DEBUG
170
  add_allocated_tensor(alloc, tensor);
@@ -185,21 +208,19 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
185
 
186
  // this is a very naive implementation, but for our case the number of free blocks should be very small
187
  static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 
 
188
  if (ggml_allocr_is_own(alloc, tensor) == false) {
189
  // the tensor was not allocated in this buffer
190
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
191
  // the easiest way to deal with this is just to ignore it
192
- AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
193
  return;
194
  }
195
 
196
- void * ptr = tensor->data;
197
-
198
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
199
  size = aligned_offset(NULL, size, alloc->alignment);
200
  AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
201
-
202
- ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
203
 
204
  #ifdef GGML_ALLOCATOR_DEBUG
205
  remove_allocated_tensor(alloc, tensor);
@@ -264,18 +285,15 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
264
  alloc->n_free_blocks = 1;
265
  size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
266
  alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
267
- alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
268
  }
269
 
270
  struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
271
- struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
272
-
273
- struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
274
 
275
  *alloc = (struct ggml_allocr){
276
- /*.buffer = */ buffer,
277
- /*.buffer_owned = */ true,
278
- /*.base = */ ggml_backend_buffer_get_base(buffer),
279
  /*.alignment = */ alignment,
280
  /*.n_free_blocks = */ 0,
281
  /*.free_blocks = */ {{0}},
@@ -294,26 +312,74 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
294
  return alloc;
295
  }
296
 
297
- struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
298
- struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
299
- alloc->measure = true;
300
 
301
- return alloc;
  }
303
 
304
- struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
305
- struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
306
 
307
  *alloc = (struct ggml_allocr){
308
- /*.buffer = */ buffer,
309
- /*.buffer_owned = */ false,
310
- /*.base = */ ggml_backend_buffer_get_base(buffer),
311
- /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
312
  /*.n_free_blocks = */ 0,
313
  /*.free_blocks = */ {{0}},
314
  /*.hash_table = */ {{0}},
315
  /*.max_size = */ 0,
316
- /*.measure = */ false,
317
  /*.parse_seq = */ {0},
318
  /*.parse_seq_len = */ 0,
319
  #ifdef GGML_ALLOCATOR_DEBUG
@@ -327,8 +393,8 @@ struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * bu
327
  }
328
 
329
  void ggml_allocr_free(struct ggml_allocr * alloc) {
330
- if (alloc->buffer_owned) {
331
- ggml_backend_buffer_free(alloc->buffer);
332
  }
333
  free(alloc);
334
  }
@@ -371,6 +437,7 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
371
  case GGML_OP_ROPE:
372
  case GGML_OP_RMS_NORM:
373
  case GGML_OP_SOFT_MAX:
 
374
  return true;
375
 
376
  default:
@@ -378,23 +445,12 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
378
  }
379
  }
380
 
381
- static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
382
- assert(view->view_src != NULL && view->view_src->data != NULL);
383
- view->backend = view->view_src->backend;
384
- view->buffer = view->view_src->buffer;
385
- view->data = (char *)view->view_src->data + view->view_offs;
386
-
387
- // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
388
- // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
389
- assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
390
- ggml_backend_buffer_init_tensor(alloc->buffer, view);
391
- }
392
-
393
  static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
394
  struct hash_node * ht = alloc->hash_table;
395
  if (node->data == NULL) {
396
  if (ggml_is_view(node)) {
397
- init_view(alloc, node);
 
398
  } else {
399
  // see if we can reuse a parent's buffer (inplace)
400
  if (ggml_op_can_inplace(node->op)) {
@@ -422,17 +478,13 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
422
  // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
423
  // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
424
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
425
- node->view_src = view_src;
426
- view_src_hn->n_views += 1;
427
- init_view(alloc, node);
428
  return;
429
  }
430
  }
431
  else {
432
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
433
- node->view_src = parent;
434
- p_hn->n_views += 1;
435
- init_view(alloc, node);
436
  return;
437
  }
438
  }
@@ -443,7 +495,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
443
  }
444
  }
445
 
446
- size_t ggml_allocr_alloc_graph_n(
447
  struct ggml_allocr * alloc,
448
  struct ggml_cgraph ** graphs, int n_graphs,
449
  struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -461,10 +513,6 @@ size_t ggml_allocr_alloc_graph_n(
461
  if (ggml_is_view(node)) {
462
  struct ggml_tensor * view_src = node->view_src;
463
  hash_get(ht, view_src)->n_views += 1;
464
- if (node->buffer == NULL && node->data != NULL) {
465
- // view of a pre-allocated tensor, didn't call init_view() yet
466
- init_view(alloc, node);
467
- }
468
  }
469
 
470
  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -473,9 +521,6 @@ size_t ggml_allocr_alloc_graph_n(
473
  break;
474
  }
475
  hash_get(ht, parent)->n_children += 1;
476
- if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
477
- init_view(alloc, parent);
478
- }
479
  }
480
  }
481
  }
@@ -586,7 +631,7 @@ size_t ggml_allocr_alloc_graph_n(
586
  }
587
 
588
  size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
589
- return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
590
  }
591
 
592
  size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
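
A note on the allocator hunks above: with the ggml-backend buffer plumbing removed, ggml_allocr_is_own() decides ownership by testing whether a tensor's data pointer falls inside [alloc->data, alloc->data + alloc->max_size) rather than comparing buffer handles. A minimal sketch of that pointer-range test, with names local to the sketch:

    // Illustrative only: the pointer-range ownership check used by the reverted allocator.
    #include <cstdio>
    #include <cstddef>

    struct pool {
        void * data;       // start of the managed region
        size_t max_size;   // extent handed out so far
    };

    static bool pool_owns(const pool & p, const void * ptr) {
        const char * base = static_cast<const char *>(p.data);
        const char * c    = static_cast<const char *>(ptr);
        // comparing pointers from unrelated objects is what the original relies on too;
        // it behaves as expected on the flat address spaces ggml targets
        return c >= base && c < base + p.max_size;
    }

    int main() {
        char backing[256];
        pool p = { backing, sizeof(backing) };
        int  elsewhere = 0;   // an object that does not live in the pool

        std::printf("inside : %d\n", pool_owns(p, backing + 200));  // prints 1
        std::printf("outside: %d\n", pool_owns(p, &elsewhere));     // prints 0
    }
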
 
1
  #include "ggml-alloc.h"
 
2
  #include "ggml.h"
3
  #include <assert.h>
4
  #include <stdarg.h>
 
6
  #include <stdlib.h>
7
  #include <string.h>
8
 
9
+ #ifdef __has_include
10
+ #if __has_include(<unistd.h>)
11
+ #include <unistd.h>
12
+ #if defined(_POSIX_MAPPED_FILES)
13
+ #include <sys/types.h>
14
+ #include <sys/mman.h>
15
+ #endif
16
+ #endif
17
+ #endif
18
+
19
+ #if defined(_WIN32)
20
+ #define WIN32_LEAN_AND_MEAN
21
+ #ifndef NOMINMAX
22
+ #define NOMINMAX
23
+ #endif
24
+ #include <windows.h>
25
+ #include <memoryapi.h>
26
+ #endif
27
+
28
 
29
  #define UNUSED(x) (void)(x)
30
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
80
  #define MAX_FREE_BLOCKS 256
81
 
82
  struct ggml_allocr {
 
 
83
  void * data;
84
+ size_t size;
85
  size_t alignment;
86
  int n_free_blocks;
87
  struct free_block free_blocks[MAX_FREE_BLOCKS];
 
119
  }
120
  #endif
121
 
122
+ static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
123
+ return ggml_nbytes(tensor);
124
+
125
+ UNUSED(alloc);
126
+ }
127
+
128
  // check if a tensor is allocated by this buffer
129
  static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
130
+ void * ptr = tensor->data;
131
+ return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
132
  }
133
 
134
  static bool ggml_is_view(struct ggml_tensor * t) {
 
136
  }
137
 
138
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
139
+ #ifdef GGML_ALLOCATOR_DEBUG
140
  GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
141
  GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
142
+ #endif
143
+ size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
144
  size = aligned_offset(NULL, size, alloc->alignment);
145
 
146
  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 
188
 
189
  tensor->data = addr;
190
  AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
 
 
191
 
192
  #ifdef GGML_ALLOCATOR_DEBUG
193
  add_allocated_tensor(alloc, tensor);
 
208
 
209
  // this is a very naive implementation, but for our case the number of free blocks should be very small
210
  static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
211
+ void * ptr = tensor->data;
212
+
213
  if (ggml_allocr_is_own(alloc, tensor) == false) {
214
  // the tensor was not allocated in this buffer
215
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
216
  // the easiest way to deal with this is just to ignore it
 
217
  return;
218
  }
219
 
220
+ size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
 
 
221
  size = aligned_offset(NULL, size, alloc->alignment);
222
  AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
223
+ AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
 
224
 
225
  #ifdef GGML_ALLOCATOR_DEBUG
226
  remove_allocated_tensor(alloc, tensor);
 
285
  alloc->n_free_blocks = 1;
286
  size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
287
  alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
288
+ alloc->free_blocks[0].size = alloc->size - align_offset;
289
  }
290
 
291
  struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
292
+ struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
 
293
 
294
  *alloc = (struct ggml_allocr){
295
+ /*.data = */ data,
296
+ /*.size = */ size,
 
297
  /*.alignment = */ alignment,
298
  /*.n_free_blocks = */ 0,
299
  /*.free_blocks = */ {{0}},
 
312
  return alloc;
313
  }
314
 
315
+ // OS specific functions to allocate and free uncommitted virtual memory
316
+ static void * alloc_vmem(size_t size) {
317
+ #if defined(_WIN32)
318
+ return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
319
+ #elif defined(_POSIX_MAPPED_FILES)
320
+ void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
321
+ if (ptr == MAP_FAILED) {
322
+ return NULL;
323
+ }
324
+ return ptr;
325
+ #else
326
+ // use a fixed address for other platforms
327
+ uintptr_t base_addr = (uintptr_t)-size - 0x100;
328
+ return (void *)base_addr;
329
+ #endif
330
+ }
331
 
332
+ static void free_vmem(void * base_addr, size_t size) {
333
+ #if defined(_WIN32)
334
+ VirtualFree(base_addr, 0, MEM_RELEASE);
335
+ UNUSED(size);
336
+ #elif defined(_POSIX_MAPPED_FILES)
337
+ munmap(base_addr, size);
338
+ #else
339
+ // nothing to do
340
+ UNUSED(base_addr);
341
+ UNUSED(size);
342
+ #endif
343
  }
344
 
345
+ // allocate uncommitted virtual memory to measure the size of the graph
346
+ static void alloc_measure_vmem(void ** base_addr, size_t * size) {
347
+ // 128GB for 64-bit, 1GB for 32-bit
348
+ *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
349
+ do {
350
+ *base_addr = alloc_vmem(*size);
351
+ if (*base_addr != NULL) {
352
+ AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
353
+ return;
354
+ }
355
+ // try again with half the size
356
+ *size /= 2;
357
+ } while (*size > 0);
358
+
359
+ GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
360
+ }
361
+
362
+ static void free_measure_vmem(void * base_addr, size_t size) {
363
+ free_vmem(base_addr, size);
364
+ }
365
+
366
+ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
367
+ struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
368
+
369
+ void * base_addr;
370
+ size_t size;
371
+
372
+ alloc_measure_vmem(&base_addr, &size);
373
 
374
  *alloc = (struct ggml_allocr){
375
+ /*.data = */ base_addr,
376
+ /*.size = */ size,
377
+ /*.alignment = */ alignment,
 
378
  /*.n_free_blocks = */ 0,
379
  /*.free_blocks = */ {{0}},
380
  /*.hash_table = */ {{0}},
381
  /*.max_size = */ 0,
382
+ /*.measure = */ true,
383
  /*.parse_seq = */ {0},
384
  /*.parse_seq_len = */ 0,
385
  #ifdef GGML_ALLOCATOR_DEBUG
 
393
  }
394
 
395
  void ggml_allocr_free(struct ggml_allocr * alloc) {
396
+ if (alloc->measure) {
397
+ free_measure_vmem(alloc->data, alloc->size);
398
  }
399
  free(alloc);
400
  }
 
437
  case GGML_OP_ROPE:
438
  case GGML_OP_RMS_NORM:
439
  case GGML_OP_SOFT_MAX:
440
+ case GGML_OP_CONT:
441
  return true;
442
 
443
  default:
 
445
  }
446
  }
447
 
448
  static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
449
  struct hash_node * ht = alloc->hash_table;
450
  if (node->data == NULL) {
451
  if (ggml_is_view(node)) {
452
+ assert(node->view_src->data != NULL);
453
+ node->data = (char *)node->view_src->data + node->view_offs;
454
  } else {
455
  // see if we can reuse a parent's buffer (inplace)
456
  if (ggml_op_can_inplace(node->op)) {
 
478
  // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
479
  // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
480
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
481
+ node->data = parent->data;
 
 
482
  return;
483
  }
484
  }
485
  else {
486
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
487
+ node->data = parent->data;
 
 
488
  return;
489
  }
490
  }
 
495
  }
496
  }
497
 
498
+ static size_t ggml_allocr_alloc_graph_tensors_n(
499
  struct ggml_allocr * alloc,
500
  struct ggml_cgraph ** graphs, int n_graphs,
501
  struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
 
513
  if (ggml_is_view(node)) {
514
  struct ggml_tensor * view_src = node->view_src;
515
  hash_get(ht, view_src)->n_views += 1;
 
 
 
 
516
  }
517
 
518
  for (int j = 0; j < GGML_MAX_SRC; j++) {
 
521
  break;
522
  }
523
  hash_get(ht, parent)->n_children += 1;
 
 
 
524
  }
525
  }
526
  }
 
631
  }
632
 
633
  size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
634
+ return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
635
  }
636
 
637
  size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
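
A note on the restored measure allocator above: ggml_allocr_new_measure() reserves a large range of uncommitted virtual memory (VirtualAlloc with MEM_RESERVE on Windows, mmap with PROT_NONE elsewhere) so it can hand out addresses without ever committing pages, halving the request until a reservation succeeds. Below is a POSIX-only sketch of that reserve-and-halve loop; the Windows branch and the fixed-address fallback from the diff are intentionally left out.

    // Illustrative only: reserve (not commit) a large address range, halving on failure,
    // in the spirit of alloc_measure_vmem() above. Builds on Linux/macOS.
    #include <sys/mman.h>
    #include <cstdio>
    #include <cstddef>

    static void * reserve_vmem(size_t size) {
        void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
        return ptr == MAP_FAILED ? nullptr : ptr;
    }

    int main() {
        size_t size = sizeof(void *) == 4 ? (1ull << 30) : (1ull << 37);  // 1 GB or 128 GB
        void * base = nullptr;

        while (size > 0 && (base = reserve_vmem(size)) == nullptr) {
            size /= 2;   // ask for less until the kernel accepts the reservation
        }

        if (base != nullptr) {
            std::printf("reserved %.2f GB at %p (no pages committed)\n",
                        size / (1024.0 * 1024.0 * 1024.0), base);
            munmap(base, size);
        } else {
            std::printf("could not reserve any virtual memory\n");
        }
    }

Because the reserved pages are never touched, the measure pass can "allocate" every tensor in a graph and only report the peak size that a real buffer will need.
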
ggml-alloc.h CHANGED
@@ -6,27 +6,21 @@
6
  extern "C" {
7
  #endif
8
 
9
- struct ggml_backend_buffer;
10
 
11
  GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
12
  GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
13
- GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
14
 
15
  // tell the allocator to parse nodes following the order described in the list
16
  // you should call this if your graphs are optimized to execute out-of-order
17
  GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
18
 
19
- GGML_API void ggml_allocr_free (struct ggml_allocr * alloc);
20
- GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc);
21
- GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc);
22
- GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
23
  GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
24
- GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);
25
 
26
- GGML_API size_t ggml_allocr_alloc_graph_n(
27
- struct ggml_allocr * alloc,
28
- struct ggml_cgraph ** graphs, int n_graphs,
29
- struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
30
 
31
  #ifdef __cplusplus
32
  }
 
6
  extern "C" {
7
  #endif
8
 
 
9
 
10
  GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
11
  GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
12
 
13
  // tell the allocator to parse nodes following the order described in the list
14
  // you should call this if your graphs are optimized to execute out-of-order
15
  GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
16
 
17
+ GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
18
+ GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
19
+ GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
20
+ GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
21
  GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
22
+ GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
23
 
 
 
 
 
24
 
25
  #ifdef __cplusplus
26
  }
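
A note on the alignment arithmetic used throughout ggml-alloc.c: every size the allocator hands out passes through aligned_offset(), which rounds it up to the allocator's alignment. A worked sketch of that rounding, assuming a power-of-two alignment (the name align_up is local to the sketch):

    // Illustrative only: round a size or offset up to the next multiple of a
    // power-of-two alignment, the same arithmetic the allocator applies above.
    #include <cstdio>
    #include <cstddef>

    static size_t align_up(size_t offset, size_t alignment) {
        // requires alignment to be a power of two: (offset + a - 1) & ~(a - 1)
        return (offset + alignment - 1) & ~(alignment - 1);
    }

    int main() {
        const size_t alignment = 32;
        for (size_t sz : {1u, 31u, 32u, 33u, 100u}) {
            std::printf("%3zu -> %3zu\n", sz, align_up(sz, alignment));
        }
        // prints 1->32, 31->32, 32->32, 33->64, 100->128
    }
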
ggml-cuda.cu CHANGED
@@ -62,7 +62,6 @@
62
  #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
63
  #define cudaMemcpyKind hipMemcpyKind
64
  #define cudaMemset hipMemset
65
- #define cudaMemsetAsync hipMemsetAsync
66
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
67
  #define cudaSetDevice hipSetDevice
68
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -415,13 +414,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
415
  #define CUDA_SILU_BLOCK_SIZE 256
416
  #define CUDA_CPY_BLOCK_SIZE 32
417
  #define CUDA_SCALE_BLOCK_SIZE 256
418
- #define CUDA_CLAMP_BLOCK_SIZE 256
419
  #define CUDA_ROPE_BLOCK_SIZE 256
420
  #define CUDA_ALIBI_BLOCK_SIZE 32
421
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
422
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
423
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
424
- #define CUDA_GET_ROWS_BLOCK_SIZE 256
425
 
426
  // dmmv = dequantize_mul_mat_vec
427
  #ifndef GGML_CUDA_DMMV_X
@@ -1577,34 +1574,6 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
1577
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
1578
  }
1579
 
1580
- template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1581
- static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
1582
- const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
1583
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
1584
-
1585
- if (col >= ncols) {
1586
- return;
1587
- }
1588
-
1589
- const int r = y[row];
1590
-
1591
- // copy x[r*ncols + col] to dst[row*ncols + col]
1592
- const int xi = r*ncols + col;
1593
- const int di = row*ncols + col;
1594
-
1595
- const int ib = xi/qk; // block index
1596
- const int iqs = (xi%qk)/qr; // quant index
1597
- const int iybs = di - di%qk; // y block start index
1598
- const int y_offset = qr == 1 ? 1 : qk/2;
1599
-
1600
- // dequantize
1601
- dfloat2 v;
1602
- dequantize_kernel(x, ib, iqs, v);
1603
-
1604
- dst[iybs + iqs + 0] = v.x;
1605
- dst[iybs + iqs + y_offset] = v.y;
1606
- }
1607
-
1608
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1609
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1610
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@@ -4586,24 +4555,6 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
4586
  dst[i] = scale * x[i];
4587
  }
4588
 
4589
- static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
4590
- const int i = blockDim.x*blockIdx.x + threadIdx.x;
4591
-
4592
- if (i >= k) {
4593
- return;
4594
- }
4595
-
4596
- dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
4597
- }
4598
-
4599
- template<int qk, int qr, dequantize_kernel_t dq>
4600
- static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
4601
- const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
4602
- const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
4603
- const dim3 block_nums(block_num_x, nrows, 1);
4604
- k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
4605
- }
4606
-
4607
  static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4608
  const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4609
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5485,11 +5436,6 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
5485
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
5486
  }
5487
 
5488
- static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
5489
- const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
5490
- clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
5491
- }
5492
-
5493
  template<typename T>
5494
  static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5495
  const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
@@ -5753,7 +5699,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5753
  } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
5754
  GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
5755
  kind = cudaMemcpyDeviceToDevice;
5756
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5757
  int id;
5758
  CUDA_CHECK(cudaGetDevice(&id));
5759
  src_ptr = (char *) extra->data_device[id];
@@ -5789,107 +5735,6 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5789
  }
5790
  }
5791
 
5792
- static void ggml_cuda_op_repeat(
5793
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5794
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
5795
- // guaranteed to be an integer due to the check in ggml_can_repeat
5796
- const int64_t ne0 = dst->ne[0];
5797
- const int64_t ne1 = dst->ne[1];
5798
- const int64_t ne2 = dst->ne[2];
5799
- const int64_t ne3 = dst->ne[3];
5800
-
5801
- const int64_t ne00 = src0->ne[0];
5802
- const int64_t ne01 = src0->ne[1];
5803
- const int64_t ne02 = src0->ne[2];
5804
- const int64_t ne03 = src0->ne[3];
5805
-
5806
- const size_t nb0 = dst->nb[0];
5807
- const size_t nb1 = dst->nb[1];
5808
- const size_t nb2 = dst->nb[2];
5809
- const size_t nb3 = dst->nb[3];
5810
-
5811
- const size_t nb00 = src0->nb[0];
5812
- const size_t nb01 = src0->nb[1];
5813
- const size_t nb02 = src0->nb[2];
5814
- const size_t nb03 = src0->nb[3];
5815
-
5816
- const int nr0 = (int)(ne0/ne00);
5817
- const int nr1 = (int)(ne1/ne01);
5818
- const int nr2 = (int)(ne2/ne02);
5819
- const int nr3 = (int)(ne3/ne03);
5820
-
5821
- // TODO: support for transposed / permuted tensors
5822
- GGML_ASSERT(nb0 == sizeof(float));
5823
- GGML_ASSERT(nb00 == sizeof(float));
5824
-
5825
- // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
5826
- for (int i3 = 0; i3 < nr3; i3++) {
5827
- for (int k3 = 0; k3 < ne03; k3++) {
5828
- for (int i2 = 0; i2 < nr2; i2++) {
5829
- for (int k2 = 0; k2 < ne02; k2++) {
5830
- for (int i1 = 0; i1 < nr1; i1++) {
5831
- for (int k1 = 0; k1 < ne01; k1++) {
5832
- for (int i0 = 0; i0 < nr0; i0++) {
5833
- CUDA_CHECK(cudaMemcpyAsync(
5834
- (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
5835
- (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
5836
- ne00*nb0, cudaMemcpyDeviceToDevice, stream));
5837
- }
5838
- }
5839
- }
5840
- }
5841
- }
5842
- }
5843
- }
5844
-
5845
- (void) src1;
5846
- (void) src1_d;
5847
- }
5848
-
5849
- static void ggml_cuda_op_get_rows(
5850
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5851
- const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
5852
-
5853
- GGML_ASSERT(src1->type == GGML_TYPE_I32);
5854
- GGML_ASSERT(dst->type == GGML_TYPE_F32);
5855
- GGML_ASSERT(ggml_is_contiguous(src0));
5856
- GGML_ASSERT(ggml_is_contiguous(src1));
5857
- GGML_ASSERT(ggml_is_contiguous(dst));
5858
-
5859
- const int ncols = src0->ne[0];
5860
- const int nrows = ggml_nelements(src1);
5861
-
5862
- const int32_t * src1_i32 = (const int32_t *) src1_d;
5863
-
5864
- switch (src0->type) {
5865
- case GGML_TYPE_F16:
5866
- get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5867
- break;
5868
- case GGML_TYPE_F32:
5869
- get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5870
- break;
5871
- case GGML_TYPE_Q4_0:
5872
- get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5873
- break;
5874
- case GGML_TYPE_Q4_1:
5875
- get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5876
- break;
5877
- case GGML_TYPE_Q5_0:
5878
- get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5879
- break;
5880
- case GGML_TYPE_Q5_1:
5881
- get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5882
- break;
5883
- case GGML_TYPE_Q8_0:
5884
- get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
5885
- break;
5886
- default:
5887
- // TODO: k-quants
5888
- GGML_ASSERT(false);
5889
- break;
5890
- }
5891
- }
5892
-
5893
  inline void ggml_cuda_op_add(
5894
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5895
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6430,12 +6275,12 @@ inline void ggml_cuda_op_alibi(
6430
  const int64_t ne02 = src0->ne[2];
6431
  const int64_t nrows = ggml_nrows(src0);
6432
 
6433
- //const int n_past = ((int32_t *) dst->op_params)[0];
6434
  const int n_head = ((int32_t *) dst->op_params)[1];
6435
  float max_bias;
6436
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
6437
 
6438
- //GGML_ASSERT(ne01 + n_past == ne00);
6439
  GGML_ASSERT(n_head == ne02);
6440
 
6441
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -6494,14 +6339,7 @@ inline void ggml_cuda_op_scale(
6494
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6495
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6496
 
6497
- float scale;
6498
- // HACK: support for ggml backend interface
6499
- if (src1->backend == GGML_BACKEND_CPU) {
6500
- scale = ((float *) src1->data)[0];
6501
- } else {
6502
- // TODO: pass pointer to kernel instead of copying to host
6503
- CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
6504
- }
6505
 
6506
  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
6507
  CUDA_CHECK(cudaGetLastError());
@@ -6511,24 +6349,6 @@ inline void ggml_cuda_op_scale(
6511
  (void) src1_dd;
6512
  }
6513
 
6514
- inline void ggml_cuda_op_clamp(
6515
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6516
- const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
6517
-
6518
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
6519
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
6520
-
6521
- const float min = ((float *) dst->op_params)[0];
6522
- const float max = ((float *) dst->op_params)[1];
6523
-
6524
- clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
6525
- CUDA_CHECK(cudaGetLastError());
6526
-
6527
- (void) src1;
6528
- (void) dst;
6529
- (void) src1_dd;
6530
- }
6531
-
6532
  static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6533
  const int64_t nrows0 = ggml_nrows(src0);
6534
 
@@ -6538,9 +6358,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
6538
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6539
  GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6540
 
6541
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6542
- ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6543
- ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6544
 
6545
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6546
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@@ -6681,9 +6501,9 @@ static void ggml_cuda_op_mul_mat(
6681
  const size_t q8_1_ts = sizeof(block_q8_1);
6682
  const size_t q8_1_bs = QK8_1;
6683
 
6684
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6685
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6686
- ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6687
 
6688
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6689
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -6761,7 +6581,7 @@ static void ggml_cuda_op_mul_mat(
6761
  if (convert_src1_to_q8_1) {
6762
  src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6763
 
6764
- if (src1_on_device && src1_is_contiguous) {
6765
  quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6766
  CUDA_CHECK(cudaGetLastError());
6767
  }
@@ -6843,7 +6663,7 @@ static void ggml_cuda_op_mul_mat(
6843
  GGML_ASSERT(false);
6844
  }
6845
 
6846
- if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
6847
  quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6848
  CUDA_CHECK(cudaGetLastError());
6849
  }
@@ -6934,14 +6754,6 @@ static void ggml_cuda_op_mul_mat(
6934
  }
6935
  }
6936
 
6937
- static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6938
- ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
6939
- }
6940
-
6941
- static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6942
- ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
6943
- }
6944
-
6945
  static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6946
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6947
  }
@@ -6996,13 +6808,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
6996
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6997
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6998
 
6999
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7000
  void * src0_ddq = src0_extra->data_device[g_main_device];
7001
 
7002
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7003
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
7004
 
7005
- ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
7006
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
7007
 
7008
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@@ -7027,13 +6839,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
7027
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7028
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7029
 
7030
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7031
  void * src0_ddq = src0_extra->data_device[g_main_device];
7032
 
7033
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7034
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
7035
 
7036
- ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
7037
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
7038
 
7039
  const int64_t row_stride_x = nb01 / sizeof(half);
@@ -7054,11 +6866,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
7054
  }
7055
  }
7056
 
7057
- if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
7058
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
7059
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
7060
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
7061
- } else if (src0->type == GGML_TYPE_F32) {
7062
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
7063
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
7064
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
@@ -7090,10 +6902,6 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
7090
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
7091
  }
7092
 
7093
- static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7094
- ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
7095
- }
7096
-
7097
  static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7098
  const int64_t ne = ggml_nelements(src0);
7099
  GGML_ASSERT(ne == ggml_nelements(src1));
@@ -7123,8 +6931,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
7123
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7124
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7125
 
7126
- const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
7127
- const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
7128
 
7129
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7130
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -7179,8 +6987,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
7179
 
7180
  const size_t nb1 = tensor->nb[1];
7181
 
7182
- ggml_backend_type backend = tensor->backend;
7183
- ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
7184
  memset(extra, 0, sizeof(*extra));
7185
 
7186
  for (int64_t id = 0; id < g_device_count; ++id) {
@@ -7234,6 +7042,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
7234
  CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
7235
  }
7236
 
 
7237
  CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
7238
 
7239
  extra->data_device[id] = buf;
@@ -7272,17 +7081,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
7272
  delete extra;
7273
  }
7274
 
7275
- static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
7276
  static size_t g_temp_tensor_extra_index = 0;
7277
 
7278
- static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7279
  if (g_temp_tensor_extras == nullptr) {
7280
  g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
7281
  }
7282
 
7283
  size_t alloc_index = g_temp_tensor_extra_index;
7284
  g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
7285
- ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
7286
  memset(extra, 0, sizeof(*extra));
7287
 
7288
  return extra;
@@ -7310,7 +7119,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7310
  return;
7311
  }
7312
 
7313
- ggml_tensor_extra_gpu * extra;
7314
 
7315
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7316
  tensor->op == GGML_OP_VIEW ||
@@ -7319,7 +7128,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7319
 
7320
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7321
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7322
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7323
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7324
  size_t offset = 0;
7325
  if (tensor->op == GGML_OP_VIEW) {
@@ -7328,7 +7137,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
7328
  extra = ggml_cuda_alloc_temp_tensor_extra();
7329
  extra->data_device[g_main_device] = src0_ddc + offset;
7330
  } else if (tensor->op == GGML_OP_CPY) {
7331
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
7332
  void * src1_ddv = src1_extra->data_device[g_main_device];
7333
  extra = ggml_cuda_alloc_temp_tensor_extra();
7334
  extra->data_device[g_main_device] = src1_ddv;
@@ -7370,13 +7179,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
7370
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
7371
  }
7372
 
7373
- ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
7374
 
7375
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7376
  tensor->op == GGML_OP_VIEW;
7377
 
7378
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7379
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7380
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7381
  size_t view_offset = 0;
7382
  if (tensor->op == GGML_OP_VIEW) {
@@ -7394,7 +7203,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
7394
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7395
  GGML_ASSERT(ggml_is_contiguous(tensor));
7396
 
7397
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7398
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7399
  CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
7400
  }
@@ -7451,47 +7260,58 @@ void ggml_cuda_free_scratch() {
7451
  g_scratch_buffer = nullptr;
7452
  }
7453
 
7454
- bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
7455
  ggml_cuda_func_t func;
7456
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
7457
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
7458
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
7459
 
7460
- if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
7461
- return false;
7462
- }
7463
-
7464
  switch (tensor->op) {
7465
- case GGML_OP_REPEAT:
7466
- func = ggml_cuda_repeat;
7467
- break;
7468
- case GGML_OP_GET_ROWS:
7469
- func = ggml_cuda_get_rows;
7470
- break;
7471
  case GGML_OP_DUP:
 
 
 
7472
  func = ggml_cuda_dup;
7473
  break;
7474
  case GGML_OP_ADD:
 
 
 
7475
  func = ggml_cuda_add;
7476
  break;
7477
  case GGML_OP_MUL:
 
 
 
7478
  func = ggml_cuda_mul;
7479
  break;
7480
  case GGML_OP_UNARY:
7481
  switch (ggml_get_unary_op(tensor)) {
7482
  case GGML_UNARY_OP_GELU:
 
 
 
7483
  func = ggml_cuda_gelu;
7484
  break;
7485
  case GGML_UNARY_OP_SILU:
 
 
 
7486
  func = ggml_cuda_silu;
7487
  break;
7488
  default:
7489
  return false;
7490
  } break;
7491
  case GGML_OP_NORM:
 
 
 
7492
  func = ggml_cuda_norm;
7493
  break;
7494
  case GGML_OP_RMS_NORM:
 
 
 
7495
  func = ggml_cuda_rms_norm;
7496
  break;
7497
  case GGML_OP_MUL_MAT:
@@ -7501,36 +7321,54 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
7501
  func = ggml_cuda_mul_mat;
7502
  break;
7503
  case GGML_OP_SCALE:
7504
- func = ggml_cuda_scale;
7505
- break;
7506
- case GGML_OP_CLAMP:
7507
  if (!any_on_device) {
7508
  return false;
7509
  }
7510
- func = ggml_cuda_clamp;
7511
  break;
7512
  case GGML_OP_CPY:
 
 
 
7513
  func = ggml_cuda_cpy;
7514
  break;
7515
  case GGML_OP_CONT:
 
 
 
7516
  func = ggml_cuda_dup;
7517
  break;
7518
  case GGML_OP_RESHAPE:
7519
  case GGML_OP_VIEW:
7520
  case GGML_OP_PERMUTE:
7521
  case GGML_OP_TRANSPOSE:
 
 
 
7522
  func = ggml_cuda_nop;
7523
  break;
7524
  case GGML_OP_DIAG_MASK_INF:
 
 
 
7525
  func = ggml_cuda_diag_mask_inf;
7526
  break;
7527
  case GGML_OP_SOFT_MAX:
 
 
 
7528
  func = ggml_cuda_soft_max;
7529
  break;
7530
  case GGML_OP_ROPE:
 
 
 
7531
  func = ggml_cuda_rope;
7532
  break;
7533
  case GGML_OP_ALIBI:
 
 
 
7534
  func = ggml_cuda_alibi;
7535
  break;
7536
  default:
@@ -7558,263 +7396,3 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
7558
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
7559
  snprintf(description, description_size, "%s", prop.name);
7560
  }
7561
-
7562
- ////////////////////////////////////////////////////////////////////////////////
7563
-
7564
- // backend interface
7565
-
7566
- #define UNUSED GGML_UNUSED
7567
-
7568
- struct ggml_backend_context_cuda {
7569
- };
7570
-
7571
- static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
7572
- return GGML_CUDA_NAME;
7573
-
7574
- UNUSED(backend);
7575
- }
7576
-
7577
- static void ggml_backend_cuda_free(ggml_backend_t backend) {
7578
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
7579
- delete cuda_ctx;
7580
- delete backend;
7581
- }
7582
-
7583
- struct ggml_backend_buffer_context_cuda {
7584
- void * device;
7585
-
7586
- ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
7587
- size_t temp_tensor_extra_index = 0;
7588
-
7589
- ~ggml_backend_buffer_context_cuda() {
7590
- delete[] temp_tensor_extras;
7591
- }
7592
-
7593
- ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7594
- if (temp_tensor_extras == nullptr) {
7595
- temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
7596
- }
7597
-
7598
- size_t alloc_index = temp_tensor_extra_index;
7599
- temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
7600
- ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
7601
- memset(extra, 0, sizeof(*extra));
7602
-
7603
- return extra;
7604
- }
7605
- };
7606
-
7607
- static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
7608
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
7609
- CUDA_CHECK(cudaFree(ctx->device));
7610
- delete ctx;
7611
- }
7612
-
7613
- static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
7614
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
7615
- return ctx->device;
7616
- }
7617
-
7618
- static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
7619
- int64_t row_low = 0;
7620
- int64_t row_high = ggml_nrows(tensor);
7621
- int64_t nrows_split = row_high - row_low;
7622
-
7623
- size_t size = ggml_nbytes_split(tensor, nrows_split);
7624
-
7625
- int64_t ne0 = tensor->ne[0];
7626
-
7627
- if (ggml_is_quantized(tensor->type)) {
7628
- if (ne0 % MATRIX_ROW_PADDING != 0) {
7629
- size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
7630
- * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
7631
- }
7632
- }
7633
-
7634
- return size;
7635
-
7636
- UNUSED(buffer);
7637
- }
7638
-
7639
- static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
7640
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
7641
-
7642
- if (tensor->view_src != NULL && tensor->view_offs == 0) {
7643
- assert(tensor->view_src->buffer->backend == buffer->backend);
7644
- tensor->backend = tensor->view_src->backend;
7645
- tensor->extra = tensor->view_src->extra;
7646
- return;
7647
- }
7648
-
7649
- ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
7650
-
7651
- extra->data_device[g_main_device] = tensor->data;
7652
-
7653
- tensor->backend = GGML_BACKEND_GPU;
7654
- tensor->extra = extra;
7655
-
7656
- if (ggml_is_quantized(tensor->type)) {
7657
- // initialize padding to 0 to avoid possible NaN values
7658
- int64_t row_low = 0;
7659
- int64_t row_high = ggml_nrows(tensor);
7660
- int64_t nrows_split = row_high - row_low;
7661
-
7662
- size_t original_size = ggml_nbytes_split(tensor, nrows_split);
7663
- size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
7664
-
7665
- if (padded_size > original_size && tensor->view_src == nullptr) {
7666
- CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
7667
- }
7668
- }
7669
-
7670
- UNUSED(buffer);
7671
- }
7672
-
7673
- static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
7674
- /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
7675
- /* .get_base = */ ggml_backend_cuda_buffer_get_base,
7676
- /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
7677
- /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
7678
- /* .free_tensor = */ NULL,
7679
- };
7680
-
7681
- static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
7682
- ggml_cuda_set_device(g_main_device);
7683
-
7684
- ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
7685
- CUDA_CHECK(cudaMalloc(&ctx->device, size));
7686
- return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
7687
- }
7688
-
7689
- static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
7690
- return 128;
7691
- UNUSED(backend);
7692
- }
7693
-
7694
- static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
7695
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
7696
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
7697
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7698
-
7699
- CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
7700
-
7701
- UNUSED(backend);
7702
- }
7703
-
7704
- static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
7705
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
7706
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
7707
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7708
-
7709
- CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
7710
-
7711
- UNUSED(backend);
7712
- }
7713
-
7714
- static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
7715
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
7716
-
7717
- UNUSED(backend);
7718
- }
7719
-
7720
- static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
7721
- GGML_ASSERT(!"not implemented");
7722
-
7723
- return nullptr;
7724
-
7725
- UNUSED(backend);
7726
- UNUSED(cgraph);
7727
- }
7728
-
7729
- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
7730
- GGML_ASSERT(!"not implemented");
7731
-
7732
- UNUSED(backend);
7733
- UNUSED(plan);
7734
- }
7735
-
7736
- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
7737
- GGML_ASSERT(!"not implemented");
7738
-
7739
- UNUSED(backend);
7740
- UNUSED(plan);
7741
- }
7742
-
7743
- static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
7744
- ggml_cuda_set_device(g_main_device);
7745
-
7746
- ggml_compute_params params = {};
7747
- params.type = GGML_TASK_COMPUTE;
7748
- params.ith = 0;
7749
- for (int i = 0; i < cgraph->n_nodes; i++) {
7750
- ggml_tensor * node = cgraph->nodes[i];
7751
-
7752
- assert(node->backend == GGML_BACKEND_GPU);
7753
- for (int j = 0; j < GGML_MAX_SRC; j++) {
7754
- if (node->src[j] != nullptr) {
7755
- assert(node->src[j]->backend == GGML_BACKEND_GPU);
7756
- }
7757
- }
7758
-
7759
- bool ok = ggml_cuda_compute_forward(&params, node);
7760
- if (!ok) {
7761
- fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
7762
- }
7763
- GGML_ASSERT(ok);
7764
-
7765
- #if 0
7766
- if (node->type == GGML_TYPE_F32) {
7767
- cudaDeviceSynchronize();
7768
- std::vector<float> tmp(ggml_nelements(node), 0.0f);
7769
- cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
7770
- printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
7771
- ggml_type_name(node->src[0]->type),
7772
- node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
7773
- node->src[0]->name,
7774
- node->src[1] ? node->src[1]->name : "none");
7775
- double sum = 0.0;
7776
- double sq_sum = 0.0;
7777
- for (int i = 0; i < ggml_nelements(node); i++) {
7778
- printf("%f ", tmp[i]);
7779
- sum += tmp[i];
7780
- sq_sum += tmp[i]*tmp[i];
7781
- }
7782
- printf("\n");
7783
- printf("sum: %f, ", sum);
7784
- printf("sq_sum: %f\n", sq_sum);
7785
- }
7786
- #endif
7787
- }
7788
-
7789
- UNUSED(backend);
7790
- }
7791
-
7792
- static ggml_backend_i cuda_backend_i = {
7793
- /* .get_name = */ ggml_backend_cuda_name,
7794
- /* .free = */ ggml_backend_cuda_free,
7795
- /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
7796
- /* .get_alignment = */ ggml_backend_cuda_get_alignment,
7797
- /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
7798
- /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
7799
- /* .synchronize = */ ggml_backend_cuda_synchronize,
7800
- /* .cpy_tensor_from = */ nullptr,
7801
- /* .cpy_tensor_to = */ nullptr,
7802
- /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
7803
- /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
7804
- /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
7805
- /* .graph_compute = */ ggml_backend_cuda_graph_compute,
7806
- /* .supports_op = */ nullptr,
7807
- };
7808
-
7809
- ggml_backend_t ggml_backend_cuda_init() {
7810
- ggml_init_cublas(); // TODO: remove from ggml.c
7811
-
7812
- ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
7813
-
7814
- ggml_backend_t cuda_backend = new ggml_backend {
7815
- /* .interface = */ cuda_backend_i,
7816
- /* .context = */ ctx
7817
- };
7818
-
7819
- return cuda_backend;
7820
- }
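
A note on the CUDA changes above: both the removed backend glue and the surviving ggml_cuda_alloc_temp_tensor_extra() hand out per-tensor "extra" structs from a fixed-size ring and simply reuse slots, which works because these extras only have to live for one graph evaluation. A minimal sketch of that ring-reuse pattern, with a stand-in extra type and a deliberately tiny ring:

    // Illustrative only: a fixed-size ring of per-tensor scratch structs, reused in
    // round-robin order like ggml_cuda_alloc_temp_tensor_extra() above.
    #include <cstdio>
    #include <cstring>

    constexpr int MAX_NODES = 4;      // ggml uses GGML_MAX_NODES; kept tiny for the demo

    struct tensor_extra {
        void * data_device[2];        // stand-in for per-GPU device pointers
    };

    static tensor_extra g_extras[MAX_NODES];
    static int          g_extra_index = 0;

    static tensor_extra * alloc_temp_extra() {
        tensor_extra * extra = &g_extras[g_extra_index];
        g_extra_index = (g_extra_index + 1) % MAX_NODES;   // wrap around and reuse
        std::memset(extra, 0, sizeof(*extra));             // hand back a zeroed slot
        return extra;
    }

    int main() {
        // after MAX_NODES allocations the first slot comes around again, which is
        // acceptable for temporaries that never outlive a single graph evaluation
        tensor_extra * first = alloc_temp_extra();
        for (int i = 0; i < MAX_NODES - 1; ++i) {
            alloc_temp_extra();
        }
        std::printf("slot reused: %d\n", alloc_temp_extra() == first);   // prints 1
    }
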
 
62
  #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
63
  #define cudaMemcpyKind hipMemcpyKind
64
  #define cudaMemset hipMemset
 
65
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
66
  #define cudaSetDevice hipSetDevice
67
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 
414
  #define CUDA_SILU_BLOCK_SIZE 256
415
  #define CUDA_CPY_BLOCK_SIZE 32
416
  #define CUDA_SCALE_BLOCK_SIZE 256
 
417
  #define CUDA_ROPE_BLOCK_SIZE 256
418
  #define CUDA_ALIBI_BLOCK_SIZE 32
419
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
420
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
421
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
422
 
423
  // dmmv = dequantize_mul_mat_vec
424
  #ifndef GGML_CUDA_DMMV_X
 
1574
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
1575
  }
1576
 
1577
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1578
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1579
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
 
4555
  dst[i] = scale * x[i];
4556
  }
4557
 
4558
  static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
4559
  const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
4560
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 
5436
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
5437
  }
5438
 
 
 
 
 
 
5439
  template<typename T>
5440
  static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
5441
  const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
 
5699
  } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
5700
  GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
5701
  kind = cudaMemcpyDeviceToDevice;
5702
+ struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5703
  int id;
5704
  CUDA_CHECK(cudaGetDevice(&id));
5705
  src_ptr = (char *) extra->data_device[id];
 
5735
  }
5736
  }
5737
 
5738
  inline void ggml_cuda_op_add(
5739
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5740
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
6275
  const int64_t ne02 = src0->ne[2];
6276
  const int64_t nrows = ggml_nrows(src0);
6277
 
6278
+ const int n_past = ((int32_t *) dst->op_params)[0];
6279
  const int n_head = ((int32_t *) dst->op_params)[1];
6280
  float max_bias;
6281
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
6282
 
6283
+ GGML_ASSERT(ne01 + n_past == ne00);
6284
  GGML_ASSERT(n_head == ne02);
6285
 
6286
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
 
6339
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
6340
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
6341
 
6342
+ const float scale = ((float *) src1->data)[0];
 
 
 
 
 
 
 
6343
 
6344
  scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
6345
  CUDA_CHECK(cudaGetLastError());
 
6349
  (void) src1_dd;
6350
  }
6351
 
6352
  static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6353
  const int64_t nrows0 = ggml_nrows(src0);
6354
 
 
6358
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6359
  GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6360
 
6361
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6362
+ struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6363
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6364
 
6365
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6366
  const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
 
6501
  const size_t q8_1_ts = sizeof(block_q8_1);
6502
  const size_t q8_1_bs = QK8_1;
6503
 
6504
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6505
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6506
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6507
 
6508
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6509
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
 
6581
  if (convert_src1_to_q8_1) {
6582
  src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6583
 
6584
+ if (split && src1_on_device && src1_is_contiguous) {
6585
  quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6586
  CUDA_CHECK(cudaGetLastError());
6587
  }
 
6663
  GGML_ASSERT(false);
6664
  }
6665
 
6666
+ if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
6667
  quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6668
  CUDA_CHECK(cudaGetLastError());
6669
  }
 
6754
  }
6755
  }
6756
 
6757
  static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6758
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6759
  }
 
6808
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6809
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6810
 
6811
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6812
  void * src0_ddq = src0_extra->data_device[g_main_device];
6813
 
6814
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6815
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
6816
 
6817
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6818
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6819
 
6820
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 
6839
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6840
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6841
 
6842
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6843
  void * src0_ddq = src0_extra->data_device[g_main_device];
6844
 
6845
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6846
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
6847
 
6848
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6849
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6850
 
6851
  const int64_t row_stride_x = nb01 / sizeof(half);
 
6866
  }
6867
  }
6868
 
6869
+ if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
6870
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
6871
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
6872
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
6873
+ }else if (src0->type == GGML_TYPE_F32) {
6874
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6875
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
6876
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
 
6902
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
6903
  }
6904
 
 
 
 
 
6905
  static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6906
  const int64_t ne = ggml_nelements(src0);
6907
  GGML_ASSERT(ne == ggml_nelements(src1));
 
6931
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6932
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6933
 
6934
+ const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6935
+ const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6936
 
6937
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
6938
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
 
6987
 
6988
  const size_t nb1 = tensor->nb[1];
6989
 
6990
+ ggml_backend backend = tensor->backend;
6991
+ struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
6992
  memset(extra, 0, sizeof(*extra));
6993
 
6994
  for (int64_t id = 0; id < g_device_count; ++id) {
 
7042
  CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
7043
  }
7044
 
7045
+
7046
  CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
7047
 
7048
  extra->data_device[id] = buf;
 
7081
  delete extra;
7082
  }
7083
 
7084
+ static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
7085
  static size_t g_temp_tensor_extra_index = 0;
7086
 
7087
+ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
7088
  if (g_temp_tensor_extras == nullptr) {
7089
  g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
7090
  }
7091
 
7092
  size_t alloc_index = g_temp_tensor_extra_index;
7093
  g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
7094
+ struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
7095
  memset(extra, 0, sizeof(*extra));
7096
 
7097
  return extra;
 
7119
  return;
7120
  }
7121
 
7122
+ struct ggml_tensor_extra_gpu * extra;
7123
 
7124
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7125
  tensor->op == GGML_OP_VIEW ||
 
7128
 
7129
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7130
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7131
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7132
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7133
  size_t offset = 0;
7134
  if (tensor->op == GGML_OP_VIEW) {
 
7137
  extra = ggml_cuda_alloc_temp_tensor_extra();
7138
  extra->data_device[g_main_device] = src0_ddc + offset;
7139
  } else if (tensor->op == GGML_OP_CPY) {
7140
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
7141
  void * src1_ddv = src1_extra->data_device[g_main_device];
7142
  extra = ggml_cuda_alloc_temp_tensor_extra();
7143
  extra->data_device[g_main_device] = src1_ddv;
 
7179
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
7180
  }
7181
 
7182
+ struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
7183
 
7184
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
7185
  tensor->op == GGML_OP_VIEW;
7186
 
7187
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
7188
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
7189
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
7190
  size_t view_offset = 0;
7191
  if (tensor->op == GGML_OP_VIEW) {
 
7203
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
7204
  GGML_ASSERT(ggml_is_contiguous(tensor));
7205
 
7206
+ struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
7207
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
7208
  CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
7209
  }
 
7260
  g_scratch_buffer = nullptr;
7261
  }
7262
 
7263
+ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
7264
  ggml_cuda_func_t func;
7265
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
7266
  || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
7267
  || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
7268
 
 
 
 
 
7269
  switch (tensor->op) {
 
 
 
 
 
 
7270
  case GGML_OP_DUP:
7271
+ if (!any_on_device) {
7272
+ return false;
7273
+ }
7274
  func = ggml_cuda_dup;
7275
  break;
7276
  case GGML_OP_ADD:
7277
+ if (!any_on_device) {
7278
+ return false;
7279
+ }
7280
  func = ggml_cuda_add;
7281
  break;
7282
  case GGML_OP_MUL:
7283
+ if (!any_on_device) {
7284
+ return false;
7285
+ }
7286
  func = ggml_cuda_mul;
7287
  break;
7288
  case GGML_OP_UNARY:
7289
  switch (ggml_get_unary_op(tensor)) {
7290
  case GGML_UNARY_OP_GELU:
7291
+ if (!any_on_device) {
7292
+ return false;
7293
+ }
7294
  func = ggml_cuda_gelu;
7295
  break;
7296
  case GGML_UNARY_OP_SILU:
7297
+ if (!any_on_device) {
7298
+ return false;
7299
+ }
7300
  func = ggml_cuda_silu;
7301
  break;
7302
  default:
7303
  return false;
7304
  } break;
7305
  case GGML_OP_NORM:
7306
+ if (!any_on_device) {
7307
+ return false;
7308
+ }
7309
  func = ggml_cuda_norm;
7310
  break;
7311
  case GGML_OP_RMS_NORM:
7312
+ if (!any_on_device) {
7313
+ return false;
7314
+ }
7315
  func = ggml_cuda_rms_norm;
7316
  break;
7317
  case GGML_OP_MUL_MAT:
 
7321
  func = ggml_cuda_mul_mat;
7322
  break;
7323
  case GGML_OP_SCALE:
 
 
 
7324
  if (!any_on_device) {
7325
  return false;
7326
  }
7327
+ func = ggml_cuda_scale;
7328
  break;
7329
  case GGML_OP_CPY:
7330
+ if (!any_on_device) {
7331
+ return false;
7332
+ }
7333
  func = ggml_cuda_cpy;
7334
  break;
7335
  case GGML_OP_CONT:
7336
+ if (!any_on_device) {
7337
+ return false;
7338
+ }
7339
  func = ggml_cuda_dup;
7340
  break;
7341
  case GGML_OP_RESHAPE:
7342
  case GGML_OP_VIEW:
7343
  case GGML_OP_PERMUTE:
7344
  case GGML_OP_TRANSPOSE:
7345
+ if (!any_on_device) {
7346
+ return false;
7347
+ }
7348
  func = ggml_cuda_nop;
7349
  break;
7350
  case GGML_OP_DIAG_MASK_INF:
7351
+ if (!any_on_device) {
7352
+ return false;
7353
+ }
7354
  func = ggml_cuda_diag_mask_inf;
7355
  break;
7356
  case GGML_OP_SOFT_MAX:
7357
+ if (!any_on_device) {
7358
+ return false;
7359
+ }
7360
  func = ggml_cuda_soft_max;
7361
  break;
7362
  case GGML_OP_ROPE:
7363
+ if (!any_on_device) {
7364
+ return false;
7365
+ }
7366
  func = ggml_cuda_rope;
7367
  break;
7368
  case GGML_OP_ALIBI:
7369
+ if (!any_on_device) {
7370
+ return false;
7371
+ }
7372
  func = ggml_cuda_alibi;
7373
  break;
7374
  default:
 
7396
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
7397
  snprintf(description, description_size, "%s", prop.name);
7398
  }
 
ggml-cuda.h CHANGED
@@ -1,7 +1,6 @@
1
  #pragma once
2
 
3
  #include "ggml.h"
4
- #include "ggml-backend.h"
5
 
6
  #ifdef GGML_USE_HIPBLAS
7
  #define GGML_CUDA_NAME "ROCm"
@@ -43,9 +42,6 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s
43
  GGML_API int ggml_cuda_get_device_count(void);
44
  GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
45
 
46
- // backend API
47
- GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
48
-
49
  #ifdef __cplusplus
50
  }
51
  #endif
 
1
  #pragma once
2
 
3
  #include "ggml.h"
 
4
 
5
  #ifdef GGML_USE_HIPBLAS
6
  #define GGML_CUDA_NAME "ROCm"
 
42
  GGML_API int ggml_cuda_get_device_count(void);
43
  GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
44
 
 
 
 
45
  #ifdef __cplusplus
46
  }
47
  #endif
ggml-metal.h CHANGED
@@ -20,7 +20,6 @@
20
  #pragma once
21
 
22
  #include "ggml.h"
23
- #include "ggml-backend.h"
24
 
25
  #include <stddef.h>
26
  #include <stdbool.h>
@@ -36,15 +35,10 @@ struct ggml_cgraph;
36
  extern "C" {
37
  #endif
38
 
39
- //
40
- // internal API
41
- // temporary exposed to user-code
42
- //
43
 
44
  struct ggml_metal_context;
45
 
46
- void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
47
-
48
  // number of command buffers to use
49
  struct ggml_metal_context * ggml_metal_init(int n_cb);
50
  void ggml_metal_free(struct ggml_metal_context * ctx);
@@ -89,17 +83,6 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
89
  // creates gf->n_threads command buffers in parallel
90
  void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
91
 
92
- //
93
- // backend API
94
- // user-code should use only these functions
95
- //
96
-
97
- GGML_API ggml_backend_t ggml_backend_metal_init(void);
98
-
99
- GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
100
-
101
- GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
102
-
103
  #ifdef __cplusplus
104
  }
105
  #endif
 
20
  #pragma once
21
 
22
  #include "ggml.h"
 
23
 
24
  #include <stddef.h>
25
  #include <stdbool.h>
 
35
  extern "C" {
36
  #endif
37
 
38
+ void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
 
 
 
39
 
40
  struct ggml_metal_context;
41
 
 
 
42
  // number of command buffers to use
43
  struct ggml_metal_context * ggml_metal_init(int n_cb);
44
  void ggml_metal_free(struct ggml_metal_context * ctx);
 
83
  // creates gf->n_threads command buffers in parallel
84
  void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
85
 
 
 
 
 
 
 
 
 
 
 
 
86
  #ifdef __cplusplus
87
  }
88
  #endif
ggml-metal.m CHANGED
@@ -779,8 +779,8 @@ void ggml_metal_graph_compute(
779
  } break;
780
  case GGML_OP_CONCAT:
781
  {
782
- const int64_t nb = ne00;
783
 
 
784
  [encoder setComputePipelineState:ctx->pipeline_concat];
785
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
786
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
@@ -812,7 +812,6 @@ void ggml_metal_graph_compute(
812
  [encoder setBytes:&nb length:sizeof(nb) atIndex:27];
813
 
814
  const int nth = MIN(1024, ne0);
815
-
816
  [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
817
  } break;
818
  case GGML_OP_ADD:
@@ -910,10 +909,9 @@ void ggml_metal_graph_compute(
910
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
911
  [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
912
 
913
- const int64_t n = ggml_nelements(dst);
914
- GGML_ASSERT(n % 4 == 0);
915
 
916
- [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
917
  } break;
918
  case GGML_OP_UNARY:
919
  switch (ggml_get_unary_op(gf->nodes[i])) {
@@ -923,10 +921,9 @@ void ggml_metal_graph_compute(
923
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
924
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
925
 
926
- const int64_t n = ggml_nelements(dst);
927
- GGML_ASSERT(n % 4 == 0);
928
 
929
- [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
930
  } break;
931
  case GGML_UNARY_OP_RELU:
932
  {
@@ -944,10 +941,9 @@ void ggml_metal_graph_compute(
944
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
945
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
946
 
947
- const int64_t n = ggml_nelements(dst);
948
- GGML_ASSERT(n % 4 == 0);
949
 
950
- [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
951
  } break;
952
  default:
953
  {
@@ -1044,7 +1040,7 @@ void ggml_metal_graph_compute(
1044
  !ggml_is_transposed(src0) &&
1045
  !ggml_is_transposed(src1) &&
1046
  src1t == GGML_TYPE_F32 &&
1047
- ne00 % 32 == 0 && ne00 >= 64 &&
1048
  ne11 > ne11_mm_min) {
1049
  //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
1050
  switch (src0->type) {
@@ -1255,8 +1251,6 @@ void ggml_metal_graph_compute(
1255
  } break;
1256
  case GGML_OP_RMS_NORM:
1257
  {
1258
- GGML_ASSERT(ne00 % 4 == 0);
1259
-
1260
  float eps;
1261
  memcpy(&eps, dst->op_params, sizeof(float));
1262
 
@@ -1299,7 +1293,7 @@ void ggml_metal_graph_compute(
1299
 
1300
  const int nth = MIN(1024, ne00);
1301
 
1302
- //const int n_past = ((int32_t *) dst->op_params)[0];
1303
  const int n_head = ((int32_t *) dst->op_params)[1];
1304
  float max_bias;
1305
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
@@ -1477,140 +1471,3 @@ preferably one under the recommended max working set size, or else fall back to
1477
 
1478
  }
1479
  }
1480
-
1481
- ////////////////////////////////////////////////////////////////////////////////
1482
-
1483
- // backend interface
1484
-
1485
- static const char * ggml_backend_metal_name(ggml_backend_t backend) {
1486
- return "Metal";
1487
-
1488
- UNUSED(backend);
1489
- }
1490
-
1491
- static void ggml_backend_metal_free(ggml_backend_t backend) {
1492
- struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
1493
- ggml_metal_free(ctx);
1494
- free(backend);
1495
- }
1496
-
1497
- static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
1498
- return (void *)buffer->context;
1499
- }
1500
-
1501
- static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1502
- free(buffer->context);
1503
- UNUSED(buffer);
1504
- }
1505
-
1506
- static struct ggml_backend_buffer_i metal_backend_buffer_i = {
1507
- /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
1508
- /* .get_base = */ ggml_backend_metal_buffer_get_base,
1509
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
1510
- /* .init_tensor = */ NULL, // no initialization required
1511
- /* .free_tensor = */ NULL, // no cleanup required
1512
- };
1513
-
1514
- static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
1515
- struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
1516
-
1517
- void * data = ggml_metal_host_malloc(size);
1518
-
1519
- // TODO: set proper name of the buffers
1520
- ggml_metal_add_buffer(ctx, "backend", data, size, 0);
1521
-
1522
- return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
1523
- }
1524
-
1525
- static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
1526
- return 32;
1527
- UNUSED(backend);
1528
- }
1529
-
1530
- static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
1531
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
1532
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
1533
-
1534
- memcpy((char *)tensor->data + offset, data, size);
1535
-
1536
- UNUSED(backend);
1537
- }
1538
-
1539
- static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
1540
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
1541
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
1542
-
1543
- memcpy(data, (const char *)tensor->data + offset, size);
1544
-
1545
- UNUSED(backend);
1546
- }
1547
-
1548
- static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
1549
- UNUSED(backend);
1550
- }
1551
-
1552
- static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
1553
- ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
1554
-
1555
- UNUSED(backend);
1556
- }
1557
-
1558
- static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
1559
- ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
1560
-
1561
- UNUSED(backend);
1562
- }
1563
-
1564
- static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
1565
- struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
1566
-
1567
- ggml_metal_graph_compute(metal_ctx, cgraph);
1568
- }
1569
-
1570
- static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
1571
- return true;
1572
- UNUSED(backend);
1573
- UNUSED(op);
1574
- }
1575
-
1576
- static struct ggml_backend_i metal_backend_i = {
1577
- /* .get_name = */ ggml_backend_metal_name,
1578
- /* .free = */ ggml_backend_metal_free,
1579
- /* .alloc_buffer = */ ggml_backend_metal_alloc_buffer,
1580
- /* .get_alignment = */ ggml_backend_metal_get_alignment,
1581
- /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
1582
- /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
1583
- /* .synchronize = */ ggml_backend_metal_synchronize,
1584
- /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from,
1585
- /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to,
1586
- /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm
1587
- /* .graph_plan_free = */ NULL,
1588
- /* .graph_plan_compute = */ NULL,
1589
- /* .graph_compute = */ ggml_backend_metal_graph_compute,
1590
- /* .supports_op = */ ggml_backend_metal_supports_op,
1591
- };
1592
-
1593
- ggml_backend_t ggml_backend_metal_init(void) {
1594
- struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
1595
-
1596
- ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
1597
-
1598
- ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
1599
-
1600
- *metal_backend = (struct ggml_backend) {
1601
- /* .interface = */ metal_backend_i,
1602
- /* .context = */ ctx,
1603
- };
1604
-
1605
- return metal_backend;
1606
- }
1607
-
1608
- bool ggml_backend_is_metal(ggml_backend_t backend) {
1609
- return backend->iface.get_name == ggml_backend_metal_name;
1610
- }
1611
-
1612
- void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
1613
- struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
1614
-
1615
- ggml_metal_set_n_cb(ctx, n_cb);
1616
- }
 
779
  } break;
780
  case GGML_OP_CONCAT:
781
  {
 
782
 
783
+ int64_t nb = ne00;
784
  [encoder setComputePipelineState:ctx->pipeline_concat];
785
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
786
  [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
 
812
  [encoder setBytes:&nb length:sizeof(nb) atIndex:27];
813
 
814
  const int nth = MIN(1024, ne0);
 
815
  [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
816
  } break;
817
  case GGML_OP_ADD:
 
909
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
910
  [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
911
 
912
+ const int64_t n = ggml_nelements(dst)/4;
 
913
 
914
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
915
  } break;
916
  case GGML_OP_UNARY:
917
  switch (ggml_get_unary_op(gf->nodes[i])) {
 
921
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
922
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
923
 
924
+ const int64_t n = ggml_nelements(dst)/4;
 
925
 
926
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
927
  } break;
928
  case GGML_UNARY_OP_RELU:
929
  {
 
941
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
942
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
943
 
944
+ const int64_t n = ggml_nelements(dst)/4;
 
945
 
946
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
947
  } break;
948
  default:
949
  {
 
1040
  !ggml_is_transposed(src0) &&
1041
  !ggml_is_transposed(src1) &&
1042
  src1t == GGML_TYPE_F32 &&
1043
+ ne00 % 32 == 0 &&
1044
  ne11 > ne11_mm_min) {
1045
  //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
1046
  switch (src0->type) {
 
1251
  } break;
1252
  case GGML_OP_RMS_NORM:
1253
  {
 
 
1254
  float eps;
1255
  memcpy(&eps, dst->op_params, sizeof(float));
1256
 
 
1293
 
1294
  const int nth = MIN(1024, ne00);
1295
 
1296
+ const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
1297
  const int n_head = ((int32_t *) dst->op_params)[1];
1298
  float max_bias;
1299
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
1471
 
1472
  }
1473
  }
 
ggml-metal.metal CHANGED
@@ -345,11 +345,10 @@ kernel void kernel_rms_norm(
345
  uint sgitg[[simdgroup_index_in_threadgroup]],
346
  uint tiisg[[thread_index_in_simdgroup]],
347
  uint ntg[[threads_per_threadgroup]]) {
348
- device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
349
- device const float * x_scalar = (device const float *) x;
350
-
351
- float4 sumf = 0;
352
- float all_sum = 0;
353
 
354
  // parallel sum
355
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
@@ -362,7 +361,6 @@ kernel void kernel_rms_norm(
362
  }
363
 
364
  threadgroup_barrier(mem_flags::mem_threadgroup);
365
-
366
  // broadcast, simd group number is ntg / 32
367
  for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
368
  if (tpitg < i) {
@@ -370,9 +368,7 @@ kernel void kernel_rms_norm(
370
  }
371
  }
372
  if (tpitg == 0) {
373
- for (int i = 4 * (ne00 / 4); i < ne00; i++) {
374
- sum[0] += x_scalar[i];
375
- }
376
  sum[0] /= ne00;
377
  }
378
 
@@ -387,9 +383,7 @@ kernel void kernel_rms_norm(
387
  y[i00] = x[i00] * scale;
388
  }
389
  if (tpitg == 0) {
390
- for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
391
- y_scalar[i00] = x_scalar[i00] * scale;
392
- }
393
  }
394
  }
395
 
 
345
  uint sgitg[[simdgroup_index_in_threadgroup]],
346
  uint tiisg[[thread_index_in_simdgroup]],
347
  uint ntg[[threads_per_threadgroup]]) {
348
+ device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
349
+ device const float * x_scalar = (device const float *) x;
350
+ float4 sumf=0;
351
+ float all_sum=0;
 
352
 
353
  // parallel sum
354
  for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
 
361
  }
362
 
363
  threadgroup_barrier(mem_flags::mem_threadgroup);
 
364
  // broadcast, simd group number is ntg / 32
365
  for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
366
  if (tpitg < i) {
 
368
  }
369
  }
370
  if (tpitg == 0) {
371
+ for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
 
 
372
  sum[0] /= ne00;
373
  }
374
 
 
383
  y[i00] = x[i00] * scale;
384
  }
385
  if (tpitg == 0) {
386
+ for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
 
 
387
  }
388
  }
389
 
ggml.c CHANGED
@@ -162,16 +162,40 @@ typedef void * thread_ret_t;
162
 
163
  #define GGML_PRINT(...) printf(__VA_ARGS__)
164
 
165
- //
166
- // end of logging block
167
- //
168
-
169
  #ifdef GGML_USE_ACCELERATE
170
  // uncomment to use vDSP for soft max computation
171
  // note: not sure if it is actually faster
172
  //#define GGML_SOFT_MAX_ACCELERATE
173
  #endif
174
 
175
  #if defined(_MSC_VER) || defined(__MINGW32__)
176
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
177
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
@@ -4928,7 +4952,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4928
  *result = (struct ggml_tensor) {
4929
  /*.type =*/ type,
4930
  /*.backend =*/ GGML_BACKEND_CPU,
4931
- /*.buffer =*/ NULL,
4932
  /*.n_dims =*/ n_dims,
4933
  /*.ne =*/ { 1, 1, 1, 1 },
4934
  /*.nb =*/ { 0, 0, 0, 0 },
@@ -11234,7 +11257,7 @@ static void ggml_compute_forward_silu_f32(
11234
 
11235
  #ifndef NDEBUG
11236
  for (int k = 0; k < nc; k++) {
11237
- const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
11238
  UNUSED(x);
11239
  assert(!isnan(x));
11240
  assert(!isinf(x));
@@ -13060,22 +13083,24 @@ static void ggml_compute_forward_alibi_f32(
13060
  return;
13061
  }
13062
 
13063
- //const int n_past = ((int32_t *) dst->op_params)[0];
13064
  const int n_head = ((int32_t *) dst->op_params)[1];
13065
  float max_bias;
13066
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
13067
 
13068
- const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
13069
- const int64_t ne1 = src0->ne[1]; // seq_len_without_past
13070
- const int64_t ne2 = src0->ne[2]; // n_head -> this is k
13071
- //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
 
 
13072
 
13073
- const int64_t n = ggml_nrows(src0);
13074
- const int64_t ne2_ne3 = n/ne1; // ne2*ne3
13075
 
13076
- const size_t nb0 = src0->nb[0];
13077
- const size_t nb1 = src0->nb[1];
13078
- const size_t nb2 = src0->nb[2];
13079
  //const int nb3 = src0->nb[3];
13080
 
13081
  GGML_ASSERT(nb0 == sizeof(float));
@@ -13087,9 +13112,9 @@ static void ggml_compute_forward_alibi_f32(
13087
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
13088
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
13089
 
13090
- for (int64_t i = 0; i < ne0; i++) {
13091
- for (int64_t j = 0; j < ne1; j++) {
13092
- for (int64_t k = 0; k < ne2_ne3; k++) {
13093
  float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
13094
  float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
13095
 
@@ -13104,6 +13129,7 @@ static void ggml_compute_forward_alibi_f32(
13104
  }
13105
 
13106
  pdst[0] = i * m_k + src[0];
 
13107
  }
13108
  }
13109
  }
@@ -20174,10 +20200,6 @@ static enum ggml_opt_result ggml_opt_lbfgs(
20174
  ggml_vec_cpy_f32(nx, xp, x);
20175
  ggml_vec_cpy_f32(nx, gp, g);
20176
 
20177
- // TODO: instead of passing &cancel here, use the return code of the linesearch
20178
- // to determine if the optimization should be cancelled
20179
- // this is a simple change, but not doing this atm, since I don't have a nice
20180
- // way to test and don't want to break something with so many changes lined up
20181
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
20182
  if (cancel) {
20183
  return GGML_OPT_CANCEL;
 
162
 
163
  #define GGML_PRINT(...) printf(__VA_ARGS__)
164
 
 
 
 
 
165
  #ifdef GGML_USE_ACCELERATE
166
  // uncomment to use vDSP for soft max computation
167
  // note: not sure if it is actually faster
168
  //#define GGML_SOFT_MAX_ACCELERATE
169
  #endif
170
 
171
+ //
172
+ // logging
173
+ //
174
+
175
+ #if (GGML_DEBUG >= 1)
176
+ #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
177
+ #else
178
+ #define GGML_PRINT_DEBUG(...)
179
+ #endif
180
+
181
+ #if (GGML_DEBUG >= 5)
182
+ #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
183
+ #else
184
+ #define GGML_PRINT_DEBUG_5(...)
185
+ #endif
186
+
187
+ #if (GGML_DEBUG >= 10)
188
+ #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
189
+ #else
190
+ #define GGML_PRINT_DEBUG_10(...)
191
+ #endif
192
+
193
+ #define GGML_PRINT(...) printf(__VA_ARGS__)
194
+
195
+ //
196
+ // end of logging block
197
+ //
198
+
199
  #if defined(_MSC_VER) || defined(__MINGW32__)
200
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
201
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 
4952
  *result = (struct ggml_tensor) {
4953
  /*.type =*/ type,
4954
  /*.backend =*/ GGML_BACKEND_CPU,
 
4955
  /*.n_dims =*/ n_dims,
4956
  /*.ne =*/ { 1, 1, 1, 1 },
4957
  /*.nb =*/ { 0, 0, 0, 0 },
 
11257
 
11258
  #ifndef NDEBUG
11259
  for (int k = 0; k < nc; k++) {
11260
+ const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
11261
  UNUSED(x);
11262
  assert(!isnan(x));
11263
  assert(!isinf(x));
 
13083
  return;
13084
  }
13085
 
13086
+ const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
13087
  const int n_head = ((int32_t *) dst->op_params)[1];
13088
  float max_bias;
13089
  memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
13090
 
13091
+ assert(n_past >= 0);
13092
+
13093
+ const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
13094
+ const int ne1 = src0->ne[1]; // seq_len_without_past
13095
+ const int ne2 = src0->ne[2]; // n_head -> this is k
13096
+ //const int ne3 = src0->ne[3]; // 1 -> bsz
13097
 
13098
+ const int n = ggml_nrows(src0);
13099
+ const int ne2_ne3 = n/ne1; // ne2*ne3
13100
 
13101
+ const int nb0 = src0->nb[0];
13102
+ const int nb1 = src0->nb[1];
13103
+ const int nb2 = src0->nb[2];
13104
  //const int nb3 = src0->nb[3];
13105
 
13106
  GGML_ASSERT(nb0 == sizeof(float));
 
13112
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
13113
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
13114
 
13115
+ for (int i = 0; i < ne0; i++) {
13116
+ for (int j = 0; j < ne1; j++) {
13117
+ for (int k = 0; k < ne2_ne3; k++) {
13118
  float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
13119
  float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
13120
 
 
13129
  }
13130
 
13131
  pdst[0] = i * m_k + src[0];
13132
+
13133
  }
13134
  }
13135
  }
 
20200
  ggml_vec_cpy_f32(nx, xp, x);
20201
  ggml_vec_cpy_f32(nx, gp, g);
20202
 
 
 
 
 
20203
  ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
20204
  if (cancel) {
20205
  return GGML_OPT_CANCEL;
ggml.h CHANGED
@@ -326,7 +326,7 @@ extern "C" {
326
  GGML_TYPE_COUNT,
327
  };
328
 
329
- enum ggml_backend_type {
330
  GGML_BACKEND_CPU = 0,
331
  GGML_BACKEND_GPU = 10,
332
  GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,10 +479,8 @@ extern "C" {
479
 
480
  // n-dimensional tensor
481
  struct ggml_tensor {
482
- enum ggml_type type;
483
- enum ggml_backend_type backend;
484
-
485
- struct ggml_backend_buffer * buffer;
486
 
487
  int n_dims;
488
  int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -516,7 +514,7 @@ extern "C" {
516
 
517
  void * extra; // extra things e.g. for ggml-cuda.cu
518
 
519
- char padding[12];
520
  };
521
 
522
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -1360,7 +1358,7 @@ extern "C" {
1360
 
1361
  // alibi position embedding
1362
  // in-place, returns view(a)
1363
- GGML_API struct ggml_tensor * ggml_alibi(
1364
  struct ggml_context * ctx,
1365
  struct ggml_tensor * a,
1366
  int n_past,
@@ -1369,7 +1367,7 @@ extern "C" {
1369
 
1370
  // clamp
1371
  // in-place, returns view(a)
1372
- GGML_API struct ggml_tensor * ggml_clamp(
1373
  struct ggml_context * ctx,
1374
  struct ggml_tensor * a,
1375
  float min,
@@ -2104,7 +2102,7 @@ extern "C" {
2104
  enum ggml_type vec_dot_type;
2105
  } ggml_type_traits_t;
2106
 
2107
- GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2108
 
2109
  #ifdef __cplusplus
2110
  }
 
326
  GGML_TYPE_COUNT,
327
  };
328
 
329
+ enum ggml_backend {
330
  GGML_BACKEND_CPU = 0,
331
  GGML_BACKEND_GPU = 10,
332
  GGML_BACKEND_GPU_SPLIT = 20,
 
479
 
480
  // n-dimensional tensor
481
  struct ggml_tensor {
482
+ enum ggml_type type;
483
+ enum ggml_backend backend;
 
 
484
 
485
  int n_dims;
486
  int64_t ne[GGML_MAX_DIMS]; // number of elements
 
514
 
515
  void * extra; // extra things e.g. for ggml-cuda.cu
516
 
517
+ char padding[4];
518
  };
519
 
520
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
1358
 
1359
  // alibi position embedding
1360
  // in-place, returns view(a)
1361
+ struct ggml_tensor * ggml_alibi(
1362
  struct ggml_context * ctx,
1363
  struct ggml_tensor * a,
1364
  int n_past,
 
1367
 
1368
  // clamp
1369
  // in-place, returns view(a)
1370
+ struct ggml_tensor * ggml_clamp(
1371
  struct ggml_context * ctx,
1372
  struct ggml_tensor * a,
1373
  float min,
 
2102
  enum ggml_type vec_dot_type;
2103
  } ggml_type_traits_t;
2104
 
2105
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2106
 
2107
  #ifdef __cplusplus
2108
  }
gguf-py/gguf/gguf.py CHANGED
@@ -88,31 +88,29 @@ class MODEL_ARCH(IntEnum):
88
  PERSIMMON : int = auto()
89
  REFACT : int = auto()
90
  BERT : int = auto()
91
- BLOOM : int = auto()
92
 
93
 
94
  class MODEL_TENSOR(IntEnum):
95
- TOKEN_EMBD : int = auto()
96
- TOKEN_EMBD_NORM : int = auto()
97
- TOKEN_TYPES : int = auto()
98
- POS_EMBD : int = auto()
99
- OUTPUT : int = auto()
100
- OUTPUT_NORM : int = auto()
101
- ROPE_FREQS : int = auto()
102
- ATTN_Q : int = auto()
103
- ATTN_K : int = auto()
104
- ATTN_V : int = auto()
105
- ATTN_QKV : int = auto()
106
- ATTN_OUT : int = auto()
107
- ATTN_NORM : int = auto()
108
- ATTN_NORM_2 : int = auto()
109
- ATTN_ROT_EMBD : int = auto()
110
- FFN_GATE : int = auto()
111
- FFN_DOWN : int = auto()
112
- FFN_UP : int = auto()
113
- FFN_NORM : int = auto()
114
- ATTN_Q_NORM : int = auto()
115
- ATTN_K_NORM : int = auto()
116
 
117
 
118
  MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -127,31 +125,29 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
127
  MODEL_ARCH.PERSIMMON: "persimmon",
128
  MODEL_ARCH.REFACT: "refact",
129
  MODEL_ARCH.BERT: "bert",
130
- MODEL_ARCH.BLOOM: "bloom",
131
  }
132
 
133
  TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
134
- MODEL_TENSOR.TOKEN_EMBD: "token_embd",
135
- MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
136
- MODEL_TENSOR.TOKEN_TYPES: "token_types",
137
- MODEL_TENSOR.POS_EMBD: "position_embd",
138
- MODEL_TENSOR.OUTPUT_NORM: "output_norm",
139
- MODEL_TENSOR.OUTPUT: "output",
140
- MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
141
- MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
142
- MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
143
- MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
144
- MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
145
- MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
146
- MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
147
- MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
148
- MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
149
- MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
150
- MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
151
- MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
152
- MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
153
- MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
154
- MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
155
  }
156
 
157
  MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -286,18 +282,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
286
  MODEL_TENSOR.FFN_DOWN,
287
  MODEL_TENSOR.FFN_UP,
288
  ],
289
- MODEL_ARCH.BLOOM: [
290
- MODEL_TENSOR.TOKEN_EMBD,
291
- MODEL_TENSOR.TOKEN_EMBD_NORM,
292
- MODEL_TENSOR.OUTPUT_NORM,
293
- MODEL_TENSOR.OUTPUT,
294
- MODEL_TENSOR.ATTN_NORM,
295
- MODEL_TENSOR.ATTN_QKV,
296
- MODEL_TENSOR.ATTN_OUT,
297
- MODEL_TENSOR.FFN_NORM,
298
- MODEL_TENSOR.FFN_DOWN,
299
- MODEL_TENSOR.FFN_UP,
300
- ],
301
  MODEL_ARCH.GPT2: [
302
  # TODO
303
  ],
@@ -327,7 +311,6 @@ class TensorNameMap:
327
  "gpt_neox.embed_in", # gptneox
328
  "transformer.wte", # gpt2 gpt-j mpt refact
329
  "transformer.word_embeddings", # falcon
330
- "word_embeddings", # bloom
331
  "model.embed_tokens", # llama-hf
332
  "tok_embeddings", # llama-pth
333
  "embeddings.word_embeddings", # bert
@@ -339,11 +322,6 @@ class TensorNameMap:
339
  "embeddings.token_type_embeddings", # bert
340
  ),
341
 
342
- # Normalization of token embeddings
343
- MODEL_TENSOR.TOKEN_EMBD_NORM: (
344
- "word_embeddings_layernorm", # bloom
345
- ),
346
-
347
  # Position embeddings
348
  MODEL_TENSOR.POS_EMBD: (
349
  "transformer.wpe", # gpt2
@@ -354,7 +332,7 @@ class TensorNameMap:
354
  MODEL_TENSOR.OUTPUT: (
355
  "embed_out", # gptneox
356
  "lm_head", # gpt2 mpt falcon llama-hf baichuan
357
- "output", # llama-pth bloom
358
  "word_embeddings_for_head", # persimmon
359
  ),
360
 
@@ -366,7 +344,7 @@ class TensorNameMap:
366
  "norm", # llama-pth
367
  "embeddings.LayerNorm", # bert
368
  "transformer.norm_f", # mpt
369
- "ln_f", # refact bloom
370
  "language_model.encoder.final_layernorm", # persimmon
371
  ),
372
 
@@ -383,7 +361,6 @@ class TensorNameMap:
383
  "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
384
  "transformer.blocks.{bid}.norm_1", # mpt
385
  "transformer.h.{bid}.input_layernorm", # falcon7b
386
- "h.{bid}.input_layernorm", # bloom
387
  "transformer.h.{bid}.ln_mlp", # falcon40b
388
  "model.layers.{bid}.input_layernorm", # llama-hf
389
  "layers.{bid}.attention_norm", # llama-pth
@@ -402,7 +379,6 @@ class TensorNameMap:
402
  "transformer.h.{bid}.attn.c_attn", # gpt2
403
  "transformer.blocks.{bid}.attn.Wqkv", # mpt
404
  "transformer.h.{bid}.self_attention.query_key_value", # falcon
405
- "h.{bid}.self_attention.query_key_value", # bloom
406
  "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
407
  ),
408
 
@@ -436,7 +412,6 @@ class TensorNameMap:
436
  "transformer.h.{bid}.attn.c_proj", # gpt2 refact
437
  "transformer.blocks.{bid}.attn.out_proj", # mpt
438
  "transformer.h.{bid}.self_attention.dense", # falcon
439
- "h.{bid}.self_attention.dense", # bloom
440
  "model.layers.{bid}.self_attn.o_proj", # llama-hf
441
  "layers.{bid}.attention.wo", # llama-pth
442
  "encoder.layer.{bid}.attention.output.dense", # bert
@@ -454,7 +429,6 @@ class TensorNameMap:
454
  MODEL_TENSOR.FFN_NORM: (
455
  "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
456
  "transformer.h.{bid}.ln_2", # gpt2 refact
457
- "h.{bid}.post_attention_layernorm", # bloom
458
  "transformer.blocks.{bid}.norm_2", # mpt
459
  "model.layers.{bid}.post_attention_layernorm", # llama-hf
460
  "layers.{bid}.ffn_norm", # llama-pth
@@ -468,7 +442,6 @@ class TensorNameMap:
468
  "transformer.h.{bid}.mlp.c_fc", # gpt2
469
  "transformer.blocks.{bid}.ffn.up_proj", # mpt
470
  "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
471
- "h.{bid}.mlp.dense_h_to_4h", # bloom
472
  "model.layers.{bid}.mlp.up_proj", # llama-hf refact
473
  "layers.{bid}.feed_forward.w3", # llama-pth
474
  "encoder.layer.{bid}.intermediate.dense", # bert
@@ -488,7 +461,6 @@ class TensorNameMap:
488
  "transformer.h.{bid}.mlp.c_proj", # gpt2 refact
489
  "transformer.blocks.{bid}.ffn.down_proj", # mpt
490
  "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
491
- "h.{bid}.mlp.dense_4h_to_h", # bloom
492
  "model.layers.{bid}.mlp.down_proj", # llama-hf
493
  "layers.{bid}.feed_forward.w2", # llama-pth
494
  "encoder.layer.{bid}.output.dense", # bert
 
88
  PERSIMMON : int = auto()
89
  REFACT : int = auto()
90
  BERT : int = auto()
 
91
 
92
 
93
  class MODEL_TENSOR(IntEnum):
94
+ TOKEN_EMBD : int = auto()
95
+ TOKEN_TYPES : int = auto()
96
+ POS_EMBD : int = auto()
97
+ OUTPUT : int = auto()
98
+ OUTPUT_NORM : int = auto()
99
+ ROPE_FREQS : int = auto()
100
+ ATTN_Q : int = auto()
101
+ ATTN_K : int = auto()
102
+ ATTN_V : int = auto()
103
+ ATTN_QKV : int = auto()
104
+ ATTN_OUT : int = auto()
105
+ ATTN_NORM : int = auto()
106
+ ATTN_NORM_2 : int = auto()
107
+ ATTN_ROT_EMBD: int = auto()
108
+ FFN_GATE : int = auto()
109
+ FFN_DOWN : int = auto()
110
+ FFN_UP : int = auto()
111
+ FFN_NORM : int = auto()
112
+ ATTN_Q_NORM : int = auto()
113
+ ATTN_K_NORM : int = auto()
 
114
 
115
 
116
  MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
 
125
  MODEL_ARCH.PERSIMMON: "persimmon",
126
  MODEL_ARCH.REFACT: "refact",
127
  MODEL_ARCH.BERT: "bert",
 
128
  }
129
 
130
  TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
131
+ MODEL_TENSOR.TOKEN_EMBD: "token_embd",
132
+ MODEL_TENSOR.TOKEN_TYPES: "token_types",
133
+ MODEL_TENSOR.POS_EMBD: "position_embd",
134
+ MODEL_TENSOR.OUTPUT_NORM: "output_norm",
135
+ MODEL_TENSOR.OUTPUT: "output",
136
+ MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
137
+ MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
138
+ MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
139
+ MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
140
+ MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
141
+ MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
142
+ MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
143
+ MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
144
+ MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
145
+ MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
146
+ MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
147
+ MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
148
+ MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
149
+ MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
150
+ MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
 
151
  }
152
 
153
  MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
 
282
  MODEL_TENSOR.FFN_DOWN,
283
  MODEL_TENSOR.FFN_UP,
284
  ],
 
 
 
 
 
 
 
 
 
 
 
 
285
  MODEL_ARCH.GPT2: [
286
  # TODO
287
  ],
 
311
  "gpt_neox.embed_in", # gptneox
312
  "transformer.wte", # gpt2 gpt-j mpt refact
313
  "transformer.word_embeddings", # falcon
 
314
  "model.embed_tokens", # llama-hf
315
  "tok_embeddings", # llama-pth
316
  "embeddings.word_embeddings", # bert
 
322
  "embeddings.token_type_embeddings", # bert
323
  ),
324
 
 
 
 
 
 
325
  # Position embeddings
326
  MODEL_TENSOR.POS_EMBD: (
327
  "transformer.wpe", # gpt2
 
332
  MODEL_TENSOR.OUTPUT: (
333
  "embed_out", # gptneox
334
  "lm_head", # gpt2 mpt falcon llama-hf baichuan
335
+ "output", # llama-pth
336
  "word_embeddings_for_head", # persimmon
337
  ),
338
 
 
344
  "norm", # llama-pth
345
  "embeddings.LayerNorm", # bert
346
  "transformer.norm_f", # mpt
347
+ "ln_f", # refact
348
  "language_model.encoder.final_layernorm", # persimmon
349
  ),
350
 
 
361
  "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
362
  "transformer.blocks.{bid}.norm_1", # mpt
363
  "transformer.h.{bid}.input_layernorm", # falcon7b
 
364
  "transformer.h.{bid}.ln_mlp", # falcon40b
365
  "model.layers.{bid}.input_layernorm", # llama-hf
366
  "layers.{bid}.attention_norm", # llama-pth
 
379
  "transformer.h.{bid}.attn.c_attn", # gpt2
380
  "transformer.blocks.{bid}.attn.Wqkv", # mpt
381
  "transformer.h.{bid}.self_attention.query_key_value", # falcon
 
382
  "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
383
  ),
384
 
 
412
  "transformer.h.{bid}.attn.c_proj", # gpt2 refact
413
  "transformer.blocks.{bid}.attn.out_proj", # mpt
414
  "transformer.h.{bid}.self_attention.dense", # falcon
 
415
  "model.layers.{bid}.self_attn.o_proj", # llama-hf
416
  "layers.{bid}.attention.wo", # llama-pth
417
  "encoder.layer.{bid}.attention.output.dense", # bert
 
429
  MODEL_TENSOR.FFN_NORM: (
430
  "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
431
  "transformer.h.{bid}.ln_2", # gpt2 refact
 
432
  "transformer.blocks.{bid}.norm_2", # mpt
433
  "model.layers.{bid}.post_attention_layernorm", # llama-hf
434
  "layers.{bid}.ffn_norm", # llama-pth
 
442
  "transformer.h.{bid}.mlp.c_fc", # gpt2
443
  "transformer.blocks.{bid}.ffn.up_proj", # mpt
444
  "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
 
445
  "model.layers.{bid}.mlp.up_proj", # llama-hf refact
446
  "layers.{bid}.feed_forward.w3", # llama-pth
447
  "encoder.layer.{bid}.intermediate.dense", # bert
 
461
  "transformer.h.{bid}.mlp.c_proj", # gpt2 refact
462
  "transformer.blocks.{bid}.ffn.down_proj", # mpt
463
  "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
 
464
  "model.layers.{bid}.mlp.down_proj", # llama-hf
465
  "layers.{bid}.feed_forward.w2", # llama-pth
466
  "encoder.layer.{bid}.output.dense", # bert
gpttype_adapter.cpp CHANGED
@@ -1768,7 +1768,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1768
  int realnpredict = params.n_predict-stopper_unused_tokens;
1769
  float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
1770
  float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
1771
- printf("\nContextLimit: %d/%d, Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs (%.1fT/s)",current_context_tokens.size(),nctx, time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
1772
  fflush(stdout);
1773
  output.status = 1;
1774
  generation_finished = true;
 
1768
  int realnpredict = params.n_predict-stopper_unused_tokens;
1769
  float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
1770
  float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
1771
+ printf("\nTime Taken - Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs (%.1fT/s)", time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
1772
  fflush(stdout);
1773
  output.status = 1;
1774
  generation_finished = true;
koboldcpp.py CHANGED
@@ -184,10 +184,6 @@ def init_library():
184
  os.add_dll_directory(dir_path)
185
  os.add_dll_directory(abs_path)
186
  os.add_dll_directory(os.getcwd())
187
- if libname == lib_hipblas and "HIP_PATH" in os.environ:
188
- os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin"))
189
- if args.debugmode == 1:
190
- print(f"HIP/ROCm SDK at {os.environ['HIP_PATH']} included in .DLL load path")
191
  handle = ctypes.CDLL(os.path.join(dir_path, libname))
192
 
193
  handle.load_model.argtypes = [load_model_inputs]
@@ -365,7 +361,7 @@ maxhordelen = 256
365
  modelbusy = threading.Lock()
366
  requestsinqueue = 0
367
  defaultport = 5001
368
- KcppVersion = "1.47"
369
  showdebug = True
370
  showsamplerwarning = True
371
  showmaxctxwarning = True
@@ -373,8 +369,6 @@ session_kudos_earned = 0
373
  session_jobs = 0
374
  session_starttime = None
375
  exitcounter = 0
376
- punishcounter = 0 #causes a timeout if too many errors
377
- rewardcounter = 0 #reduces error counts for successful jobs
378
  totalgens = 0
379
  currentusergenkey = "" #store a special key so polled streaming works even in multiuser
380
  args = None #global args
@@ -418,34 +412,16 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
418
  elif api_format==4:
419
  # translate openai chat completion messages format into one big string.
420
  messages_array = genparams.get('messages', [])
421
- adapter_obj = genparams.get('adapter', {})
422
  messages_string = ""
423
- system_message_start = adapter_obj.get("system_start", "\n### Instruction:\n")
424
- system_message_end = adapter_obj.get("system_end", "")
425
- user_message_start = adapter_obj.get("user_start", "\n### Instruction:\n")
426
- user_message_end = adapter_obj.get("user_end", "")
427
- assistant_message_start = adapter_obj.get("assistant_start", "\n### Response:\n")
428
- assistant_message_end = adapter_obj.get("assistant_end", "")
429
-
430
  for message in messages_array:
431
  if message['role'] == "system":
432
- messages_string += system_message_start
433
  elif message['role'] == "user":
434
- messages_string += user_message_start
435
  elif message['role'] == "assistant":
436
- messages_string += assistant_message_start
437
-
438
- messages_string += message['content']
439
-
440
- if message['role'] == "system":
441
- messages_string += system_message_end
442
- elif message['role'] == "user":
443
- messages_string += user_message_end
444
- elif message['role'] == "assistant":
445
- messages_string += assistant_message_end
446
-
447
- messages_string += assistant_message_start
448
-
449
  genparams["prompt"] = messages_string
450
  frqp = genparams.get('frequency_penalty', 0.1)
451
  scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
@@ -521,9 +497,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
521
  async def handle_sse_stream(self, api_format):
522
  global friendlymodelname
523
  self.send_response(200)
524
- self.send_header("cache-control", "no-cache")
525
- self.send_header("connection", "keep-alive")
526
- self.end_headers(content_type='text/event-stream')
527
 
528
  current_token = 0
529
  incomplete_token_buffer = bytearray()
@@ -590,10 +566,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
590
  global maxctx, maxhordelen, friendlymodelname, KcppVersion, totalgens
591
  self.path = self.path.rstrip('/')
592
  response_body = None
593
- content_type = 'application/json'
594
 
595
  if self.path in ["", "/?"] or self.path.startswith(('/?','?')): #it's possible for the root url to have ?params without /
596
- content_type = 'text/html'
597
  if self.embedded_kailite is None:
598
  response_body = (f"Embedded Kobold Lite is not found.<br>You will have to connect via the main KoboldAI client, or <a href='https://lite.koboldai.net?local=1&port={self.port}'>use this URL</a> to connect.").encode()
599
  else:
@@ -639,9 +615,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
639
 
640
  elif self.path.endswith('/v1/models'):
641
  response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
 
642
 
643
  elif self.path=="/api":
644
- content_type = 'text/html'
645
  if self.embedded_kcpp_docs is None:
646
  response_body = (f"KoboldCpp partial API reference can be found at the wiki: https://github.com/LostRuins/koboldcpp/wiki").encode()
647
  else:
@@ -649,40 +625,41 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
649
  elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
650
  self.path = "/api"
651
  self.send_response(302)
652
- self.send_header("location", self.path)
653
- self.end_headers(content_type='text/html')
654
  return None
655
 
656
  if response_body is None:
657
  self.send_response(404)
658
- self.end_headers(content_type='text/html')
659
  rp = 'Error: HTTP Server is running, but this endpoint does not exist. Please check the URL.'
660
  self.wfile.write(rp.encode())
661
  else:
662
  self.send_response(200)
663
- self.send_header('content-length', str(len(response_body)))
664
- self.end_headers(content_type=content_type)
665
  self.wfile.write(response_body)
666
  return
667
 
668
  def do_POST(self):
669
  global modelbusy, requestsinqueue, currentusergenkey, totalgens
670
- content_length = int(self.headers['content-length'])
671
  body = self.rfile.read(content_length)
672
  self.path = self.path.rstrip('/')
 
673
  if self.path.endswith(('/api/extra/tokencount')):
674
  try:
675
  genparams = json.loads(body)
676
  countprompt = genparams.get('prompt', "")
677
  count = handle.token_count(countprompt.encode("UTF-8"))
678
  self.send_response(200)
679
- self.end_headers(content_type='application/json')
680
  self.wfile.write(json.dumps({"value": count}).encode())
681
 
682
  except ValueError as e:
683
  utfprint("Count Tokens - Body Error: " + str(e))
684
  self.send_response(400)
685
- self.end_headers(content_type='application/json')
686
  self.wfile.write(json.dumps({"value": -1}).encode())
687
  return
688
 
@@ -695,11 +672,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
695
  multiuserkey = ""
696
  pass
697
 
698
- if (multiuserkey=="" and requestsinqueue==0) or (multiuserkey!="" and multiuserkey==currentusergenkey):
699
  ag = handle.abort_generate()
700
  time.sleep(0.3) #short delay before replying
701
  self.send_response(200)
702
- self.end_headers(content_type='application/json')
703
  self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
704
  print("\nGeneration Aborted")
705
  else:
@@ -717,11 +694,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
717
  pass
718
 
719
  if totalgens>0:
720
- if (multiuserkey=="" and requestsinqueue==0) or (multiuserkey!="" and multiuserkey==currentusergenkey):
721
  pendtxt = handle.get_pending_output()
722
  pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore")
723
  self.send_response(200)
724
- self.end_headers(content_type='application/json')
725
  self.wfile.write(json.dumps({"results": [{"text": pendtxtStr}]}).encode())
726
  return
727
 
@@ -731,7 +708,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
731
  requestsinqueue += 1
732
  if not modelbusy.acquire(blocking=reqblocking):
733
  self.send_response(503)
734
- self.end_headers(content_type='application/json')
735
  self.wfile.write(json.dumps({"detail": {
736
  "msg": "Server is busy; please try again later.",
737
  "type": "service_unavailable",
@@ -757,9 +734,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
757
 
758
  if self.path.endswith('/v1/completions'):
759
  api_format = 3
 
760
 
761
  if self.path.endswith('/v1/chat/completions'):
762
  api_format = 4
 
763
 
764
  if api_format > 0:
765
  genparams = None
@@ -785,8 +764,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
785
  # Headers are already sent when streaming
786
  if not sse_stream_flag:
787
  self.send_response(200)
788
- self.end_headers(content_type='application/json')
789
- self.wfile.write(json.dumps(gen).encode())
790
  except:
791
  print("Generate: The response could not be sent, maybe connection was terminated?")
792
  return
@@ -794,23 +773,27 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
794
  modelbusy.release()
795
 
796
  self.send_response(404)
797
- self.end_headers(content_type='text/html')
798
 
799
 
800
  def do_OPTIONS(self):
801
  self.send_response(200)
802
- self.end_headers(content_type='text/html')
803
 
804
  def do_HEAD(self):
805
  self.send_response(200)
806
- self.end_headers(content_type='text/html')
807
-
808
- def end_headers(self, content_type=None):
809
- self.send_header('access-control-allow-origin', '*')
810
- self.send_header('access-control-allow-methods', '*')
811
- self.send_header('access-control-allow-headers', '*, Accept, Content-Type, Content-Length, Accept-Encoding, X-CSRF-Token, Client-Agent, X-Fields, Content-Type, Authorization, X-Requested-With, X-HTTP-Method-Override, apikey, genkey')
812
- if content_type is not None:
813
- self.send_header('content-type', content_type)
814
  return super(ServerRequestHandler, self).end_headers()
815
 
816
 
@@ -1034,8 +1017,7 @@ def show_new_gui():
1034
  mmq_var = ctk.IntVar(value=1)
1035
  blas_threads_var = ctk.StringVar()
1036
  blas_size_var = ctk.IntVar()
1037
- version_var = ctk.StringVar(value="0")
1038
- tensor_split_str_vars = ctk.StringVar(value="")
1039
 
1040
  smartcontext = ctk.IntVar()
1041
  context_var = ctk.IntVar()
@@ -1087,15 +1069,11 @@ def show_new_gui():
1087
  quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
1088
  mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
1089
  quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
1090
- tensor_split_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
1091
- tensor_split_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
1092
  else:
1093
  lowvram_box.grid_forget()
1094
  quick_lowvram_box.grid_forget()
1095
  mmq_box.grid_forget()
1096
  quick_mmq_box.grid_forget()
1097
- tensor_split_label.grid_forget()
1098
- tensor_split_entry.grid_forget()
1099
 
1100
  if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
1101
  gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
@@ -1108,7 +1086,6 @@ def show_new_gui():
1108
  quick_gpu_layers_label.grid_forget()
1109
  quick_gpu_layers_entry.grid_forget()
1110
 
1111
-
1112
  # presets selector
1113
  makelabel(quick_tab, "Presets:", 1)
1114
 
@@ -1141,7 +1118,7 @@ def show_new_gui():
1141
  makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2)
1142
 
1143
  # load model
1144
- makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170)
1145
 
1146
  # Hardware Tab
1147
  hardware_tab = tabcontent["Hardware"]
@@ -1160,7 +1137,6 @@ def show_new_gui():
1160
  gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4"], width=60, variable=gpu_choice_var, state="readonly")
1161
  CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4", "All"], width=60, variable=gpu_choice_var, state="readonly")
1162
  gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50)
1163
- tensor_split_entry,tensor_split_label = makelabelentry(hardware_tab, "Tensor Split:", tensor_split_str_vars, 6, 80)
1164
  lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0)
1165
  mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1)
1166
 
@@ -1209,7 +1185,7 @@ def show_new_gui():
1209
  # Model Tab
1210
  model_tab = tabcontent["Model"]
1211
 
1212
- makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1)
1213
  makefileentry(model_tab, "Lora:", "Select Lora File",lora_var, 3)
1214
  makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5)
1215
 
@@ -1289,12 +1265,6 @@ def show_new_gui():
1289
  args.noavx2 = True
1290
  args.noblas = True
1291
  args.nommap = True
1292
- if tensor_split_str_vars.get()!="":
1293
- tssv = tensor_split_str_vars.get()
1294
- if "," in tssv:
1295
- args.tensor_split = [float(x) for x in tssv.split(",")]
1296
- else:
1297
- args.tensor_split = [float(x) for x in tssv.split(" ")]
1298
 
1299
  args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
1300
 
@@ -1359,9 +1329,6 @@ def show_new_gui():
1359
  runopts_var.set(openblas_option)
1360
  if "gpulayers" in dict and dict["gpulayers"]:
1361
  gpulayers_var.set(dict["gpulayers"])
1362
- if "tensor_split" in dict and dict["tensor_split"]:
1363
- tssep = ','.join(map(str, dict["tensor_split"]))
1364
- tensor_split_str_vars.set(tssep)
1365
  if "blasthreads" in dict and dict["blasthreads"]:
1366
  blas_threads_var.set(str(dict["blasthreads"]))
1367
  else:
@@ -1480,7 +1447,7 @@ def show_gui_msgbox(title,message):
1480
  def run_horde_worker(args, api_key, worker_name):
1481
  import urllib.request
1482
  from datetime import datetime
1483
- global friendlymodelname, maxhordectx, maxhordelen, exitcounter, punishcounter, modelbusy, session_starttime
1484
  epurl = f"http://localhost:{args.port}"
1485
  if args.host!="":
1486
  epurl = f"http://{args.host}:{args.port}"
@@ -1489,11 +1456,10 @@ def run_horde_worker(args, api_key, worker_name):
1489
  print(f"{datetime.now().strftime('[%H:%M:%S]')} " + txt)
1490
 
1491
  def submit_completed_generation(url, jobid, sessionstart, submit_dict):
1492
- global exitcounter, punishcounter, session_kudos_earned, session_jobs, rewardcounter
1493
  reply = make_url_request(url, submit_dict)
1494
  if not reply:
1495
  exitcounter += 1
1496
- punishcounter += 1
1497
  print_with_time(f"Error, Job submit failed.")
1498
  else:
1499
  reward = reply["reward"]
@@ -1507,11 +1473,6 @@ def run_horde_worker(args, api_key, worker_name):
1507
  elapsedtimestr = f"{hrs:03d}h:{mins:02d}m:{secs:02d}s"
1508
  earnrate = session_kudos_earned/(elapsedtime.seconds/3600)
1509
  print_with_time(f'Submitted {jobid} and earned {reward:.0f} kudos\n[Total:{session_kudos_earned:.0f} kudos, Time:{elapsedtimestr}, Jobs:{session_jobs}, EarnRate:{earnrate:.0f} kudos/hr]')
1510
- rewardcounter += 1
1511
- if rewardcounter > 50:
1512
- rewardcounter = 0
1513
- if exitcounter > 5:
1514
- exitcounter -= 1
1515
 
1516
  def make_url_request(url, data, method='POST'):
1517
  try:
@@ -1520,7 +1481,7 @@ def run_horde_worker(args, api_key, worker_name):
1520
  if method=='POST':
1521
  json_payload = json.dumps(data).encode('utf-8')
1522
  request = urllib.request.Request(url, data=json_payload, headers=headers, method=method)
1523
- request.add_header('content-type', 'application/json')
1524
  else:
1525
  request = urllib.request.Request(url, headers=headers, method=method)
1526
  response_data = ""
@@ -1547,23 +1508,17 @@ def run_horde_worker(args, api_key, worker_name):
1547
  print(f"===\nEmbedded Horde Worker '{worker_name}' Starting...\n(To use your own KAI Bridge/Scribe worker instead, don't set your API key)")
1548
  BRIDGE_AGENT = f"KoboldCppEmbedWorker:2:https://github.com/LostRuins/koboldcpp"
1549
  cluster = "https://horde.koboldai.net"
1550
- while exitcounter < 35:
1551
  time.sleep(3)
1552
  readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
1553
  if readygo:
1554
  print_with_time(f"Embedded Horde Worker '{worker_name}' is started.")
1555
  break
1556
 
1557
- while exitcounter < 35:
1558
  currentjob_attempts = 0
1559
  current_generation = None
1560
 
1561
- if punishcounter >= 10:
1562
- punishcounter = 0
1563
- print_with_time(f"Horde Worker Paused for 10 min - Too many errors. It will resume automatically.")
1564
- print_with_time(f"Caution: Too many failed jobs may lead to entering maintenance mode.")
1565
- time.sleep(600)
1566
-
1567
  #first, make sure we are not generating
1568
  if modelbusy.locked():
1569
  time.sleep(0.2)
@@ -1582,7 +1537,6 @@ def run_horde_worker(args, api_key, worker_name):
1582
  pop = make_url_request(f'{cluster}/api/v2/generate/text/pop',gen_dict)
1583
  if not pop:
1584
  exitcounter += 1
1585
- punishcounter += 1
1586
  print_with_time(f"Failed to fetch job from {cluster}. Waiting 5 seconds...")
1587
  time.sleep(5)
1588
  continue
@@ -1601,7 +1555,7 @@ def run_horde_worker(args, api_key, worker_name):
1601
  print_with_time(f"Job received from {cluster} for {current_payload.get('max_length',80)} tokens and {current_payload.get('max_context_length',1024)} max context. Starting generation...")
1602
 
1603
  #do gen
1604
- while exitcounter < 35:
1605
  if not modelbusy.locked():
1606
  current_generation = make_url_request(f'{epurl}/api/v1/generate', current_payload)
1607
  if current_generation:
@@ -1926,10 +1880,4 @@ if __name__ == '__main__':
1926
  parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", action='store_true')
1927
  parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
1928
 
1929
- # #deprecated hidden args. they do nothing. do not use
1930
- # parser.add_argument("--psutil_set_threads", action='store_true', help=argparse.SUPPRESS)
1931
- # parser.add_argument("--stream", action='store_true', help=argparse.SUPPRESS)
1932
- # parser.add_argument("--unbantokens", action='store_true', help=argparse.SUPPRESS)
1933
- # parser.add_argument("--usemirostat", action='store_true', help=argparse.SUPPRESS)
1934
-
1935
  main(parser.parse_args(),start_server=True)
 
184
  os.add_dll_directory(dir_path)
185
  os.add_dll_directory(abs_path)
186
  os.add_dll_directory(os.getcwd())
187
  handle = ctypes.CDLL(os.path.join(dir_path, libname))
188
 
189
  handle.load_model.argtypes = [load_model_inputs]
 
361
  modelbusy = threading.Lock()
362
  requestsinqueue = 0
363
  defaultport = 5001
364
+ KcppVersion = "1.46.1"
365
  showdebug = True
366
  showsamplerwarning = True
367
  showmaxctxwarning = True
 
369
  session_jobs = 0
370
  session_starttime = None
371
  exitcounter = 0
 
 
372
  totalgens = 0
373
  currentusergenkey = "" #store a special key so polled streaming works even in multiuser
374
  args = None #global args
 
412
  elif api_format==4:
413
  # translate openai chat completion messages format into one big string.
414
  messages_array = genparams.get('messages', [])
 
415
  messages_string = ""
416
  for message in messages_array:
417
  if message['role'] == "system":
418
+ messages_string+="\n### Instruction:\n"
419
  elif message['role'] == "user":
420
+ messages_string+="\n### Instruction:\n"
421
  elif message['role'] == "assistant":
422
+ messages_string+="\n### Response:\n"
423
+ messages_string+=message['content']
424
+ messages_string += "\n### Response:\n"
425
  genparams["prompt"] = messages_string
426
  frqp = genparams.get('frequency_penalty', 0.1)
427
  scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
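
For reference, the api_format==4 branch above flattens the OpenAI-style messages array into one Alpaca-style prompt string. A minimal standalone sketch of that logic (the helper name is the editor's, not part of the commit):

    def flatten_chat_messages(messages):
        # system and user turns both open an "### Instruction" block,
        # assistant turns open a "### Response" block, mirroring the loop above
        prompt = ""
        for message in messages:
            if message['role'] in ("system", "user"):
                prompt += "\n### Instruction:\n"
            elif message['role'] == "assistant":
                prompt += "\n### Response:\n"
            prompt += message['content']
        # a trailing Response header so generation continues as the assistant
        prompt += "\n### Response:\n"
        return prompt

    # flatten_chat_messages([{"role": "user", "content": "Hello"}])
    # -> "\n### Instruction:\nHello\n### Response:\n"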
 
497
  async def handle_sse_stream(self, api_format):
498
  global friendlymodelname
499
  self.send_response(200)
500
+ self.send_header("Cache-Control", "no-cache")
501
+ self.send_header("Connection", "keep-alive")
502
+ self.end_headers(force_json=True, sse_stream_flag=True)
503
 
504
  current_token = 0
505
  incomplete_token_buffer = bytearray()
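
The handler above advertises text/event-stream, so tokens are delivered as standard Server-Sent Events. A generic framing helper, shown only as a sketch (the exact event fields koboldcpp emits are not visible in this hunk):

    import json

    def sse_event(payload: dict) -> bytes:
        # one SSE frame: a "data:" line terminated by a blank line
        return ("data: " + json.dumps(payload) + "\n\n").encode("utf-8")

    # e.g. self.wfile.write(sse_event({"token": "Hello"})) inside the streaming loop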
 
566
  global maxctx, maxhordelen, friendlymodelname, KcppVersion, totalgens
567
  self.path = self.path.rstrip('/')
568
  response_body = None
569
+ force_json = False
570
 
571
  if self.path in ["", "/?"] or self.path.startswith(('/?','?')): #it's possible for the root url to have ?params without /
572
+
573
  if self.embedded_kailite is None:
574
  response_body = (f"Embedded Kobold Lite is not found.<br>You will have to connect via the main KoboldAI client, or <a href='https://lite.koboldai.net?local=1&port={self.port}'>use this URL</a> to connect.").encode()
575
  else:
 
615
 
616
  elif self.path.endswith('/v1/models'):
617
  response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
618
+ force_json = True
619
 
620
  elif self.path=="/api":
 
621
  if self.embedded_kcpp_docs is None:
622
  response_body = (f"KoboldCpp partial API reference can be found at the wiki: https://github.com/LostRuins/koboldcpp/wiki").encode()
623
  else:
 
625
  elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
626
  self.path = "/api"
627
  self.send_response(302)
628
+ self.send_header("Location", self.path)
629
+ self.end_headers()
630
  return None
631
 
632
  if response_body is None:
633
  self.send_response(404)
634
+ self.end_headers()
635
  rp = 'Error: HTTP Server is running, but this endpoint does not exist. Please check the URL.'
636
  self.wfile.write(rp.encode())
637
  else:
638
  self.send_response(200)
639
+ self.send_header('Content-Length', str(len(response_body)))
640
+ self.end_headers(force_json=force_json)
641
  self.wfile.write(response_body)
642
  return
643
 
644
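As a usage sketch for the GET routes wired up above, assuming the default port 5001 set earlier in this file:

    import json, urllib.request

    base = "http://localhost:5001"
    with urllib.request.urlopen(base + "/v1/models") as resp:
        # force_json=True above means this endpoint replies as application/json
        models = json.load(resp)
    print(models["data"][0]["id"])  # the loaded model's friendly name
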
  def do_POST(self):
645
  global modelbusy, requestsinqueue, currentusergenkey, totalgens
646
+ content_length = int(self.headers['Content-Length'])
647
  body = self.rfile.read(content_length)
648
  self.path = self.path.rstrip('/')
649
+ force_json = False
650
  if self.path.endswith(('/api/extra/tokencount')):
651
  try:
652
  genparams = json.loads(body)
653
  countprompt = genparams.get('prompt', "")
654
  count = handle.token_count(countprompt.encode("UTF-8"))
655
  self.send_response(200)
656
+ self.end_headers()
657
  self.wfile.write(json.dumps({"value": count}).encode())
658
 
659
  except ValueError as e:
660
  utfprint("Count Tokens - Body Error: " + str(e))
661
  self.send_response(400)
662
+ self.end_headers()
663
  self.wfile.write(json.dumps({"value": -1}).encode())
664
  return
665
 
 
672
  multiuserkey = ""
673
  pass
674
 
675
+ if (multiuserkey!="" and multiuserkey==currentusergenkey) or requestsinqueue==0:
676
  ag = handle.abort_generate()
677
  time.sleep(0.3) #short delay before replying
678
  self.send_response(200)
679
+ self.end_headers()
680
  self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
681
  print("\nGeneration Aborted")
682
  else:
 
694
  pass
695
 
696
  if totalgens>0:
697
+ if (multiuserkey!="" and multiuserkey==currentusergenkey) or requestsinqueue==0:
698
  pendtxt = handle.get_pending_output()
699
  pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore")
700
  self.send_response(200)
701
+ self.end_headers()
702
  self.wfile.write(json.dumps({"results": [{"text": pendtxtStr}]}).encode())
703
  return
704
 
 
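Both the abort branch and the pending-output branch above gate on the same condition; as a sketch (helper name is the editor's):

    def may_touch_current_generation(genkey, currentusergenkey, requestsinqueue):
        # a caller may act on the in-flight generation if it presents the key of the
        # current generation, or if nothing else is queued behind it
        return (genkey != "" and genkey == currentusergenkey) or requestsinqueue == 0
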
708
  requestsinqueue += 1
709
  if not modelbusy.acquire(blocking=reqblocking):
710
  self.send_response(503)
711
+ self.end_headers()
712
  self.wfile.write(json.dumps({"detail": {
713
  "msg": "Server is busy; please try again later.",
714
  "type": "service_unavailable",
 
734
 
735
  if self.path.endswith('/v1/completions'):
736
  api_format = 3
737
+ force_json = True
738
 
739
  if self.path.endswith('/v1/chat/completions'):
740
  api_format = 4
741
+ force_json = True
742
 
743
  if api_format > 0:
744
  genparams = None
 
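Summarising the OpenAI-compatible routes above as a sketch (only the two format codes visible in this hunk are listed):

    OPENAI_API_FORMATS = {
        "/v1/completions": 3,       # OpenAI text completions, force_json=True
        "/v1/chat/completions": 4,  # OpenAI chat completions, force_json=True
    }
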
764
  # Headers are already sent when streaming
765
  if not sse_stream_flag:
766
  self.send_response(200)
767
+ self.end_headers(force_json=force_json)
768
+ self.wfile.write(json.dumps(gen).encode())
769
  except:
770
  print("Generate: The response could not be sent, maybe connection was terminated?")
771
  return
 
773
  modelbusy.release()
774
 
775
  self.send_response(404)
776
+ self.end_headers()
777
 
778
 
779
  def do_OPTIONS(self):
780
  self.send_response(200)
781
+ self.end_headers()
782
 
783
  def do_HEAD(self):
784
  self.send_response(200)
785
+ self.end_headers()
786
+
787
+ def end_headers(self, force_json=False, sse_stream_flag=False):
788
+ self.send_header('Access-Control-Allow-Origin', '*')
789
+ self.send_header('Access-Control-Allow-Methods', '*')
790
+ self.send_header('Access-Control-Allow-Headers', '*')
791
+ if ("/api" in self.path and self.path!="/api") or force_json:
792
+ if sse_stream_flag:
793
+ self.send_header('Content-type', 'text/event-stream')
794
+ self.send_header('Content-type', 'application/json')
795
+ else:
796
+ self.send_header('Content-type', 'text/html')
797
  return super(ServerRequestHandler, self).end_headers()
798
 
799
 
 
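The reworked end_headers above folds the CORS headers and the content-type choice into one place. A standalone sketch of just the content-type decision (function name is the editor's):

    def pick_content_type(path, force_json=False, sse_stream_flag=False):
        # API paths (anything under /api except the /api docs page itself) and the
        # OpenAI-compatible endpoints (force_json=True) reply as JSON; everything else as HTML
        headers = []
        if ("/api" in path and path != "/api") or force_json:
            if sse_stream_flag:
                headers.append(("Content-type", "text/event-stream"))
            headers.append(("Content-type", "application/json"))
        else:
            headers.append(("Content-type", "text/html"))
        return headers

    # pick_content_type("/api/extra/tokencount") -> [("Content-type", "application/json")]
    # pick_content_type("/")                     -> [("Content-type", "text/html")]
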
1017
  mmq_var = ctk.IntVar(value=1)
1018
  blas_threads_var = ctk.StringVar()
1019
  blas_size_var = ctk.IntVar()
1020
+ version_var =ctk.StringVar(value="0")
 
1021
 
1022
  smartcontext = ctk.IntVar()
1023
  context_var = ctk.IntVar()
 
1069
  quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
1070
  mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
1071
  quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
 
 
1072
  else:
1073
  lowvram_box.grid_forget()
1074
  quick_lowvram_box.grid_forget()
1075
  mmq_box.grid_forget()
1076
  quick_mmq_box.grid_forget()
 
 
1077
 
1078
  if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
1079
  gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
 
1086
  quick_gpu_layers_label.grid_forget()
1087
  quick_gpu_layers_entry.grid_forget()
1088
 
 
1089
  # presets selector
1090
  makelabel(quick_tab, "Presets:", 1)
1091
 
 
1118
  makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2)
1119
 
1120
  # load model
1121
+ makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170,filetypes=[("GGML Model Files", "*.gguf;*.bin;*.ggml")])
1122
 
1123
  # Hardware Tab
1124
  hardware_tab = tabcontent["Hardware"]
 
1137
  gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4"], width=60, variable=gpu_choice_var, state="readonly")
1138
  CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4", "All"], width=60, variable=gpu_choice_var, state="readonly")
1139
  gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50)
 
1140
  lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0)
1141
  mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1)
1142
 
 
1185
  # Model Tab
1186
  model_tab = tabcontent["Model"]
1187
 
1188
+ makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1, filetypes=[("GGML Model Files", "*.gguf;*.bin;*.ggml")])
1189
  makefileentry(model_tab, "Lora:", "Select Lora File",lora_var, 3)
1190
  makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5)
1191
 
 
1265
  args.noavx2 = True
1266
  args.noblas = True
1267
  args.nommap = True
1268
 
1269
  args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
1270
 
 
1329
  runopts_var.set(openblas_option)
1330
  if "gpulayers" in dict and dict["gpulayers"]:
1331
  gpulayers_var.set(dict["gpulayers"])
1332
  if "blasthreads" in dict and dict["blasthreads"]:
1333
  blas_threads_var.set(str(dict["blasthreads"]))
1334
  else:
 
1447
  def run_horde_worker(args, api_key, worker_name):
1448
  import urllib.request
1449
  from datetime import datetime
1450
+ global friendlymodelname, maxhordectx, maxhordelen, exitcounter, modelbusy, session_starttime
1451
  epurl = f"http://localhost:{args.port}"
1452
  if args.host!="":
1453
  epurl = f"http://{args.host}:{args.port}"
 
1456
  print(f"{datetime.now().strftime('[%H:%M:%S]')} " + txt)
1457
 
1458
  def submit_completed_generation(url, jobid, sessionstart, submit_dict):
1459
+ global exitcounter, session_kudos_earned, session_jobs
1460
  reply = make_url_request(url, submit_dict)
1461
  if not reply:
1462
  exitcounter += 1
 
1463
  print_with_time(f"Error, Job submit failed.")
1464
  else:
1465
  reward = reply["reward"]
 
1473
  elapsedtimestr = f"{hrs:03d}h:{mins:02d}m:{secs:02d}s"
1474
  earnrate = session_kudos_earned/(elapsedtime.seconds/3600)
1475
  print_with_time(f'Submitted {jobid} and earned {reward:.0f} kudos\n[Total:{session_kudos_earned:.0f} kudos, Time:{elapsedtimestr}, Jobs:{session_jobs}, EarnRate:{earnrate:.0f} kudos/hr]')
1476
 
1477
  def make_url_request(url, data, method='POST'):
1478
  try:
 
1481
  if method=='POST':
1482
  json_payload = json.dumps(data).encode('utf-8')
1483
  request = urllib.request.Request(url, data=json_payload, headers=headers, method=method)
1484
+ request.add_header('Content-Type', 'application/json')
1485
  else:
1486
  request = urllib.request.Request(url, headers=headers, method=method)
1487
  response_data = ""
 
1508
  print(f"===\nEmbedded Horde Worker '{worker_name}' Starting...\n(To use your own KAI Bridge/Scribe worker instead, don't set your API key)")
1509
  BRIDGE_AGENT = f"KoboldCppEmbedWorker:2:https://github.com/LostRuins/koboldcpp"
1510
  cluster = "https://horde.koboldai.net"
1511
+ while exitcounter < 10:
1512
  time.sleep(3)
1513
  readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
1514
  if readygo:
1515
  print_with_time(f"Embedded Horde Worker '{worker_name}' is started.")
1516
  break
1517
 
1518
+ while exitcounter < 10:
1519
  currentjob_attempts = 0
1520
  current_generation = None
1521
1522
  #first, make sure we are not generating
1523
  if modelbusy.locked():
1524
  time.sleep(0.2)
 
1537
  pop = make_url_request(f'{cluster}/api/v2/generate/text/pop',gen_dict)
1538
  if not pop:
1539
  exitcounter += 1
 
1540
  print_with_time(f"Failed to fetch job from {cluster}. Waiting 5 seconds...")
1541
  time.sleep(5)
1542
  continue
 
1555
  print_with_time(f"Job received from {cluster} for {current_payload.get('max_length',80)} tokens and {current_payload.get('max_context_length',1024)} max context. Starting generation...")
1556
 
1557
  #do gen
1558
+ while exitcounter < 10:
1559
  if not modelbusy.locked():
1560
  current_generation = make_url_request(f'{epurl}/api/v1/generate', current_payload)
1561
  if current_generation:
 
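In this build the horde worker's retry handling is simplified: each failed request bumps exitcounter, and the three loops above all stop once it reaches 10. As a sketch of that bail-out check (constant name is the editor's):

    MAX_WORKER_ERRORS = 10  # the loops above compare exitcounter < 10 inline

    def worker_should_continue(exitcounter):
        return exitcounter < MAX_WORKER_ERRORS
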
1880
  parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", action='store_true')
1881
  parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
1882
 
1883
  main(parser.parse_args(),start_server=True)
llama.cpp CHANGED
@@ -189,7 +189,6 @@ enum llm_arch {
189
  LLM_ARCH_STARCODER,
190
  LLM_ARCH_PERSIMMON,
191
  LLM_ARCH_REFACT,
192
- LLM_ARCH_BLOOM,
193
  LLM_ARCH_UNKNOWN,
194
  };
195
 
@@ -203,8 +202,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
203
  { LLM_ARCH_BAICHUAN, "baichuan" },
204
  { LLM_ARCH_STARCODER, "starcoder" },
205
  { LLM_ARCH_PERSIMMON, "persimmon" },
206
- { LLM_ARCH_REFACT, "refact" },
207
- { LLM_ARCH_BLOOM, "bloom" },
208
  };
209
 
210
  enum llm_kv {
@@ -307,7 +305,6 @@ struct LLM_KV {
307
 
308
  enum llm_tensor {
309
  LLM_TENSOR_TOKEN_EMBD,
310
- LLM_TENSOR_TOKEN_EMBD_NORM,
311
  LLM_TENSOR_POS_EMBD,
312
  LLM_TENSOR_OUTPUT,
313
  LLM_TENSOR_OUTPUT_NORM,
@@ -428,14 +425,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
428
  LLM_ARCH_MPT,
429
  {
430
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
431
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
432
- { LLM_TENSOR_OUTPUT, "output" },
433
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
434
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
435
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
436
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
437
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
438
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
439
  },
440
  },
441
  {
@@ -470,21 +459,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
470
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
471
  },
472
  },
473
- {
474
- LLM_ARCH_BLOOM,
475
- {
476
- { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
477
- { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
478
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
479
- { LLM_TENSOR_OUTPUT, "output" },
480
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
481
- { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
482
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
483
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
484
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
485
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
486
- },
487
- },
488
  {
489
  LLM_ARCH_UNKNOWN,
490
  {
@@ -1042,9 +1016,6 @@ struct llama_hparams {
1042
  float rope_freq_base_train;
1043
  float rope_freq_scale_train;
1044
 
1045
- float f_clamp_kqv;
1046
- float f_max_alibi_bias;
1047
-
1048
  bool operator!=(const llama_hparams & other) const {
1049
  if (this->vocab_only != other.vocab_only) return true;
1050
  if (this->n_vocab != other.n_vocab) return true;
@@ -1230,8 +1201,6 @@ struct llama_model {
1230
 
1231
  struct ggml_tensor * tok_embeddings;
1232
  struct ggml_tensor * pos_embeddings;
1233
- struct ggml_tensor * tok_norm;
1234
- struct ggml_tensor * tok_norm_b;
1235
 
1236
  struct ggml_tensor * output_norm;
1237
  struct ggml_tensor * output_norm_b;
@@ -1361,11 +1330,7 @@ static bool llama_kv_cache_init(
1361
  cache.cells.clear();
1362
  cache.cells.resize(n_ctx);
1363
 
1364
- // TODO: this should be:
1365
- // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
1366
- // change it and test that it works
1367
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
1368
- memset(cache.buf.data, 0, cache.buf.size);
1369
 
1370
  struct ggml_init_params params;
1371
  params.mem_size = cache.buf.size;
@@ -1771,7 +1736,7 @@ struct llama_model_loader {
1771
  }
1772
  }
1773
 
1774
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
1775
  if (backend != GGML_BACKEND_CPU) {
1776
  ggml_set_no_alloc(ctx, true);
1777
  }
@@ -1789,7 +1754,7 @@ struct llama_model_loader {
1789
  return tensor;
1790
  }
1791
 
1792
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
1793
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1794
 
1795
  if (cur == NULL) {
@@ -2082,13 +2047,13 @@ static void llm_load_hparams(
2082
  }
2083
  } break;
2084
  case LLM_ARCH_PERSIMMON:
2085
- {
2086
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2087
- switch (hparams.n_layer) {
2088
- case 36: model.type = e_model::MODEL_8B; break;
2089
- default: model.type = e_model::MODEL_UNKNOWN;
2090
- }
2091
- } break;
2092
  case LLM_ARCH_REFACT:
2093
  {
2094
  GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -2097,33 +2062,6 @@ static void llm_load_hparams(
2097
  default: model.type = e_model::MODEL_UNKNOWN;
2098
  }
2099
  } break;
2100
- case LLM_ARCH_BLOOM:
2101
- {
2102
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2103
-
2104
- switch (hparams.n_layer) {
2105
- case 24: model.type = e_model::MODEL_1B; break;
2106
- case 30:
2107
- switch (hparams.n_embd) {
2108
- case 2560: model.type = e_model::MODEL_3B; break;
2109
- case 4096: model.type = e_model::MODEL_7B; break;
2110
- } break;
2111
- }
2112
- } break;
2113
- case LLM_ARCH_MPT:
2114
- {
2115
- hparams.f_clamp_kqv = 0.0f;
2116
-
2117
- GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2118
- GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
2119
- GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
2120
-
2121
- switch (hparams.n_layer) {
2122
- case 32: model.type = e_model::MODEL_7B; break;
2123
- case 48: model.type = e_model::MODEL_30B; break;
2124
- default: model.type = e_model::MODEL_UNKNOWN;
2125
- }
2126
- } break;
2127
  default: (void)0;
2128
  }
2129
 
@@ -2268,8 +2206,6 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
2268
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2269
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2270
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
2271
- LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
2272
- LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
2273
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2274
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2275
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2369,8 +2305,8 @@ static void llm_load_tensors(
2369
 
2370
  // output
2371
  {
2372
- ggml_backend_type backend_norm;
2373
- ggml_backend_type backend_output;
2374
 
2375
  if (n_gpu_layers > int(n_layer)) {
2376
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2405,8 +2341,8 @@ static void llm_load_tensors(
2405
  model.layers.resize(n_layer);
2406
 
2407
  for (uint32_t i = 0; i < n_layer; ++i) {
2408
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2409
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2410
 
2411
  auto & layer = model.layers[i];
2412
 
@@ -2435,8 +2371,8 @@ static void llm_load_tensors(
2435
  {
2436
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2437
  {
2438
- ggml_backend_type backend_norm;
2439
- ggml_backend_type backend_output;
2440
 
2441
  if (n_gpu_layers > int(n_layer)) {
2442
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2471,8 +2407,8 @@ static void llm_load_tensors(
2471
  model.layers.resize(n_layer);
2472
 
2473
  for (uint32_t i = 0; i < n_layer; ++i) {
2474
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2475
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2476
 
2477
  auto & layer = model.layers[i];
2478
 
@@ -2505,8 +2441,8 @@ static void llm_load_tensors(
2505
 
2506
  // output
2507
  {
2508
- ggml_backend_type backend_norm;
2509
- ggml_backend_type backend_output;
2510
 
2511
  if (n_gpu_layers > int(n_layer)) {
2512
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2543,8 +2479,8 @@ static void llm_load_tensors(
2543
  model.layers.resize(n_layer);
2544
 
2545
  for (uint32_t i = 0; i < n_layer; ++i) {
2546
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2547
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2548
 
2549
  auto & layer = model.layers[i];
2550
 
@@ -2582,8 +2518,8 @@ static void llm_load_tensors(
2582
 
2583
  // output
2584
  {
2585
- ggml_backend_type backend_norm;
2586
- ggml_backend_type backend_output;
2587
 
2588
  if (n_gpu_layers > int(n_layer)) {
2589
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2620,8 +2556,8 @@ static void llm_load_tensors(
2620
  model.layers.resize(n_layer);
2621
 
2622
  for (uint32_t i = 0; i < n_layer; ++i) {
2623
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2624
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2625
 
2626
  auto & layer = model.layers[i];
2627
 
@@ -2659,8 +2595,8 @@ static void llm_load_tensors(
2659
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2660
 
2661
  {
2662
- ggml_backend_type backend_norm;
2663
- ggml_backend_type backend_output;
2664
 
2665
  if (n_gpu_layers > int(n_layer)) {
2666
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2694,8 +2630,8 @@ static void llm_load_tensors(
2694
  const int i_gpu_start = n_layer - n_gpu_layers;
2695
  model.layers.resize(n_layer);
2696
  for (uint32_t i = 0; i < n_layer; ++i) {
2697
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2698
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2699
  auto & layer = model.layers[i];
2700
  layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2701
  layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2715,155 +2651,6 @@ static void llm_load_tensors(
2715
  layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
2716
  }
2717
  } break;
2718
- case LLM_ARCH_BLOOM:
2719
- {
2720
- // TODO: CPU-only for now
2721
-
2722
- model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2723
- model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
2724
- model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
2725
-
2726
- // output
2727
- {
2728
- ggml_backend_type backend_norm;
2729
- ggml_backend_type backend_output;
2730
-
2731
- if (n_gpu_layers > int(n_layer)) {
2732
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2733
- // on Windows however this is detrimental unless everything is on the GPU
2734
- #ifndef _WIN32
2735
- backend_norm = LLAMA_BACKEND_OFFLOAD;
2736
- #else
2737
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2738
- #endif // _WIN32
2739
-
2740
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2741
- } else {
2742
- backend_norm = GGML_BACKEND_CPU;
2743
- backend_output = GGML_BACKEND_CPU;
2744
- }
2745
-
2746
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2747
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
2748
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2749
-
2750
- if (backend_norm == GGML_BACKEND_GPU) {
2751
- vram_weights += ggml_nbytes(model.output_norm);
2752
- vram_weights += ggml_nbytes(model.output_norm_b);
2753
- }
2754
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2755
- vram_weights += ggml_nbytes(model.output);
2756
- }
2757
- }
2758
-
2759
- const uint32_t n_ff = hparams.n_ff;
2760
-
2761
- const int i_gpu_start = n_layer - n_gpu_layers;
2762
-
2763
- model.layers.resize(n_layer);
2764
-
2765
- for (uint32_t i = 0; i < n_layer; ++i) {
2766
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2767
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2768
-
2769
- auto & layer = model.layers[i];
2770
-
2771
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2772
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
2773
-
2774
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
2775
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
2776
-
2777
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2778
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
2779
-
2780
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2781
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
2782
-
2783
- layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
2784
- layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
2785
-
2786
- layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2787
- layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
2788
-
2789
- if (backend == GGML_BACKEND_GPU) {
2790
- vram_weights +=
2791
- ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
2792
- ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
2793
- ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
2794
- ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
2795
- ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
2796
- ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
2797
- }
2798
- }
2799
- } break;
2800
- case LLM_ARCH_MPT:
2801
- {
2802
- model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2803
-
2804
- // output
2805
- {
2806
- ggml_backend_type backend_norm;
2807
- ggml_backend_type backend_output;
2808
-
2809
- if (n_gpu_layers > int(n_layer)) {
2810
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2811
- // on Windows however this is detrimental unless everything is on the GPU
2812
- #ifndef _WIN32
2813
- backend_norm = LLAMA_BACKEND_OFFLOAD;
2814
- #else
2815
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2816
- #endif // _WIN32
2817
-
2818
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2819
- } else {
2820
- backend_norm = GGML_BACKEND_CPU;
2821
- backend_output = GGML_BACKEND_CPU;
2822
- }
2823
-
2824
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2825
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2826
-
2827
- if (backend_norm == GGML_BACKEND_GPU) {
2828
- vram_weights += ggml_nbytes(model.output_norm);
2829
- }
2830
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2831
- vram_weights += ggml_nbytes(model.output);
2832
- }
2833
- }
2834
-
2835
- const uint32_t n_ff = hparams.n_ff;
2836
-
2837
- const int i_gpu_start = n_layer - n_gpu_layers;
2838
-
2839
- model.layers.resize(n_layer);
2840
-
2841
- for (uint32_t i = 0; i < n_layer; ++i) {
2842
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2843
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2844
-
2845
- auto & layer = model.layers[i];
2846
-
2847
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2848
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
2849
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2850
-
2851
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2852
-
2853
- layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2854
- layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2855
-
2856
- if (backend == GGML_BACKEND_GPU) {
2857
- vram_weights +=
2858
- ggml_nbytes(layer.attn_norm) +
2859
- ggml_nbytes(layer.wqkv) +
2860
- ggml_nbytes(layer.wo) +
2861
- ggml_nbytes(layer.ffn_norm) +
2862
- ggml_nbytes(layer.w2) +
2863
- ggml_nbytes(layer.w3);
2864
- }
2865
- }
2866
- } break;
2867
  default:
2868
  throw std::runtime_error("unknown architecture");
2869
  }
@@ -4720,6 +4507,7 @@ static struct ggml_cgraph * llm_build_starcoder(
4720
  return gf;
4721
  }
4722
 
 
4723
  static struct ggml_cgraph * llm_build_persimmon(
4724
  llama_context & lctx,
4725
  const llama_batch & batch) {
@@ -5117,571 +4905,12 @@ static struct ggml_cgraph * llm_build_persimmon(
5117
  return gf;
5118
  }
5119
 
5120
- static struct ggml_cgraph * llm_build_bloom(
5121
  llama_context & lctx,
5122
  const llama_batch & batch) {
5123
- const auto & model = lctx.model;
5124
- const auto & hparams = model.hparams;
5125
- const auto & cparams = lctx.cparams;
5126
-
5127
- const auto & kv_self = lctx.kv_self;
5128
-
5129
- GGML_ASSERT(!!kv_self.ctx);
5130
-
5131
- const int64_t n_embd = hparams.n_embd;
5132
- const int64_t n_layer = hparams.n_layer;
5133
- const int64_t n_ctx = cparams.n_ctx;
5134
- const int64_t n_head = hparams.n_head;
5135
- const int64_t n_head_kv = hparams.n_head_kv;
5136
- const int64_t n_embd_head = hparams.n_embd_head();
5137
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
5138
-
5139
- GGML_ASSERT(n_embd_head == hparams.n_rot);
5140
-
5141
- const float norm_eps = hparams.f_norm_eps;
5142
-
5143
- const int32_t n_tokens = batch.n_tokens;
5144
- const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5145
- const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5146
-
5147
- auto & buf_compute = lctx.buf_compute;
5148
-
5149
- struct ggml_init_params params = {
5150
- /*.mem_size =*/ buf_compute.size,
5151
- /*.mem_buffer =*/ buf_compute.data,
5152
- /*.no_alloc =*/ false,
5153
- };
5154
-
5155
- params.no_alloc = true;
5156
-
5157
- struct ggml_context * ctx0 = ggml_init(params);
5158
-
5159
- ggml_cgraph * gf = ggml_new_graph(ctx0);
5160
-
5161
- struct ggml_tensor * cur;
5162
- struct ggml_tensor * token;
5163
- struct ggml_tensor * inpL;
5164
-
5165
- if (batch.token) {
5166
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5167
-
5168
- ggml_allocr_alloc(lctx.alloc, inp_tokens);
5169
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5170
- memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5171
- }
5172
- ggml_set_name(inp_tokens, "inp_tokens");
5173
-
5174
- token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5175
- } else {
5176
- #ifdef GGML_USE_MPI
5177
- GGML_ASSERT(false && "not implemented");
5178
- #endif
5179
-
5180
- token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5181
-
5182
- ggml_allocr_alloc(lctx.alloc, token);
5183
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5184
- memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
5185
- }
5186
- }
5187
-
5188
- // KQ_scale
5189
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5190
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5191
- ggml_allocr_alloc(lctx.alloc, KQ_scale);
5192
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5193
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5194
- }
5195
 
5196
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5197
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5198
- ggml_set_name(KQ_mask, "KQ_mask");
5199
- ggml_allocr_alloc(lctx.alloc, KQ_mask);
5200
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5201
- float * data = (float *) KQ_mask->data;
5202
- memset(data, 0, ggml_nbytes(KQ_mask));
5203
-
5204
- for (int h = 0; h < 1; ++h) {
5205
- for (int j = 0; j < n_tokens; ++j) {
5206
- const llama_pos pos = batch.pos[j];
5207
- const llama_seq_id seq_id = batch.seq_id[j];
5208
-
5209
- for (int i = 0; i < n_kv; ++i) {
5210
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5211
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5212
- }
5213
- }
5214
- }
5215
- }
5216
- }
5217
-
5218
- // norm
5219
- {
5220
- inpL = ggml_norm(ctx0, token, norm_eps);
5221
- inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
5222
- }
5223
-
5224
- ggml_set_name(inpL, "inpL");
5225
-
5226
- for (int il = 0; il < n_layer; ++il) {
5227
- {
5228
- // Norm
5229
- cur = ggml_norm(ctx0, inpL, norm_eps);
5230
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
5231
- }
5232
-
5233
- {
5234
- // Self Attention
5235
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
5236
-
5237
- struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
5238
- struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
5239
- struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
5240
-
5241
- struct ggml_tensor * Qcur = tmpq;
5242
- struct ggml_tensor * Kcur = tmpk;
5243
-
5244
- // store key and value to memory
5245
- {
5246
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5247
- ggml_set_name(Vcur, "Vcur");
5248
-
5249
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5250
- ggml_set_name(k, "k");
5251
-
5252
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5253
- ( n_ctx)*ggml_element_size(kv_self.v),
5254
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5255
-
5256
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5257
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5258
- }
5259
-
5260
- struct ggml_tensor * Q =
5261
- ggml_permute(ctx0,
5262
- ggml_cpy(ctx0,
5263
- Qcur,
5264
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
5265
- 0, 2, 1, 3);
5266
- ggml_set_name(Q, "Q");
5267
-
5268
- struct ggml_tensor * K =
5269
- ggml_view_3d(ctx0, kv_self.k,
5270
- n_embd_head, n_kv, n_head_kv,
5271
- ggml_element_size(kv_self.k)*n_embd_gqa,
5272
- ggml_element_size(kv_self.k)*n_embd_head,
5273
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5274
- ggml_set_name(K, "K");
5275
-
5276
- // K * Q
5277
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5278
- ggml_set_name(KQ, "KQ");
5279
-
5280
- // KQ_scaled = KQ / sqrt(n_embd_head)
5281
- // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
5282
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
5283
- ggml_set_name(KQ_scaled, "KQ_scaled");
5284
-
5285
- struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
5286
- ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5287
-
5288
- // KQ_masked = mask_past(KQ_scaled)
5289
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5290
- ggml_set_name(KQ_masked, "KQ_masked");
5291
-
5292
- // KQ = soft_max(KQ_masked)
5293
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
5294
- ggml_set_name(KQ_soft_max, "KQ_soft_max");
5295
-
5296
- // split cached V into n_head heads
5297
- struct ggml_tensor * V =
5298
- ggml_view_3d(ctx0, kv_self.v,
5299
- n_kv, n_embd_head, n_head_kv,
5300
- ggml_element_size(kv_self.v)*n_ctx,
5301
- ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5302
- ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5303
- ggml_set_name(V, "V");
5304
-
5305
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5306
- ggml_set_name(KQV, "KQV");
5307
-
5308
- // KQV_merged = KQV.permute(0, 2, 1, 3)
5309
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5310
- ggml_set_name(KQV_merged, "KQV_merged");
5311
-
5312
- // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
5313
- cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5314
- ggml_set_name(cur, "KQV_merged_contiguous");
5315
- }
5316
-
5317
- // Projection
5318
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
5319
-
5320
- // Add the input
5321
- cur = ggml_add(ctx0, cur, inpL);
5322
-
5323
- struct ggml_tensor * inpFF = cur;
5324
-
5325
- // FF
5326
- {
5327
- // Norm
5328
- {
5329
- cur = ggml_norm(ctx0, inpFF, norm_eps);
5330
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
5331
- }
5332
-
5333
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
5334
-
5335
- // GELU activation
5336
- cur = ggml_gelu(ctx0, cur);
5337
-
5338
- // Projection
5339
- cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
5340
- }
5341
-
5342
- inpL = ggml_add(ctx0, cur, inpFF);
5343
- }
5344
-
5345
- // Output Norm
5346
- {
5347
- cur = ggml_norm(ctx0, inpL, norm_eps);
5348
- cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
5349
- }
5350
- ggml_set_name(cur, "result_norm");
5351
-
5352
- cur = ggml_mul_mat(ctx0, model.output, cur);
5353
- ggml_set_name(cur, "result_output");
5354
-
5355
- ggml_build_forward_expand(gf, cur);
5356
-
5357
- ggml_free(ctx0);
5358
-
5359
- return gf;
5360
- }
5361
-
5362
- static struct ggml_cgraph * llm_build_mpt(
5363
- llama_context & lctx,
5364
- const llama_batch & batch) {
5365
- const auto & model = lctx.model;
5366
- const auto & hparams = model.hparams;
5367
- const auto & cparams = lctx.cparams;
5368
-
5369
- const auto & kv_self = lctx.kv_self;
5370
-
5371
- GGML_ASSERT(!!kv_self.ctx);
5372
-
5373
- const int64_t n_embd = hparams.n_embd;
5374
- const int64_t n_layer = hparams.n_layer;
5375
- const int64_t n_ctx = cparams.n_ctx;
5376
- const int64_t n_head = hparams.n_head;
5377
- const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
5378
- const int64_t n_embd_head = hparams.n_embd_head();
5379
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
5380
-
5381
- const float norm_eps = hparams.f_norm_eps;
5382
- const float clamp_kqv = hparams.f_clamp_kqv;
5383
- const float max_alibi_bias = hparams.f_max_alibi_bias;
5384
-
5385
- const int n_gpu_layers = model.n_gpu_layers;
5386
-
5387
- const int32_t n_tokens = batch.n_tokens;
5388
- const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
5389
- const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
5390
-
5391
- auto & buf_compute = lctx.buf_compute;
5392
-
5393
- struct ggml_init_params params = {
5394
- /*.mem_size =*/ buf_compute.size,
5395
- /*.mem_buffer =*/ buf_compute.data,
5396
- /*.no_alloc =*/ false,
5397
- };
5398
-
5399
- params.no_alloc = true;
5400
-
5401
- struct ggml_context * ctx0 = ggml_init(params);
5402
-
5403
- ggml_cgraph * gf = ggml_new_graph(ctx0);
5404
-
5405
- struct ggml_tensor * cur;
5406
- struct ggml_tensor * inpL;
5407
-
5408
- //int warmup = 0;
5409
- if (batch.token) {
5410
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5411
-
5412
- ggml_allocr_alloc(lctx.alloc, inp_tokens);
5413
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5414
- memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
5415
- //warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
5416
- }
5417
-
5418
- ggml_set_name(inp_tokens, "inp_tokens");
5419
-
5420
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
5421
- } else {
5422
- #ifdef GGML_USE_MPI
5423
- GGML_ASSERT(false && "not implemented");
5424
- #endif
5425
-
5426
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
5427
-
5428
- ggml_allocr_alloc(lctx.alloc, inpL);
5429
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5430
- memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
5431
- }
5432
- }
5433
-
5434
- const int i_gpu_start = n_layer - n_gpu_layers;
5435
- (void) i_gpu_start;
5436
-
5437
- // offload functions set the tensor output backend to GPU
5438
- // tensors are GPU-accelerated if any input or the output has been offloaded
5439
- offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
5440
- offload_func_t offload_func_kq = llama_nop;
5441
- offload_func_t offload_func_v = llama_nop;
5442
-
5443
- #ifdef GGML_USE_CUBLAS
5444
- if (n_gpu_layers > n_layer) {
5445
- offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
5446
- }
5447
- if (n_gpu_layers > n_layer + 1) {
5448
- offload_func_v = ggml_cuda_assign_buffers_no_alloc;
5449
- }
5450
- if (n_gpu_layers > n_layer + 2) {
5451
- offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
5452
- }
5453
- #endif // GGML_USE_CUBLAS
5454
-
5455
- // KQ_scale
5456
- struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5457
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
5458
- ggml_allocr_alloc(lctx.alloc, KQ_scale);
5459
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5460
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
5461
- }
5462
-
5463
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5464
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5465
- offload_func_kq(KQ_mask);
5466
- ggml_set_name(KQ_mask, "KQ_mask");
5467
- ggml_allocr_alloc(lctx.alloc, KQ_mask);
5468
- if (!ggml_allocr_is_measure(lctx.alloc)) {
5469
- float * data = (float *) KQ_mask->data;
5470
- memset(data, 0, ggml_nbytes(KQ_mask));
5471
-
5472
- for (int h = 0; h < 1; ++h) {
5473
- for (int j = 0; j < n_tokens; ++j) {
5474
- const llama_pos pos = batch.pos[j];
5475
- const llama_seq_id seq_id = batch.seq_id[j];
5476
-
5477
- for (int i = 0; i < n_kv; ++i) {
5478
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
5479
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
5480
- }
5481
- }
5482
- }
5483
- }
5484
- }
5485
-
5486
- for (int il = 0; il < n_layer; ++il) {
5487
- struct ggml_tensor * attn_norm;
5488
-
5489
- offload_func_t offload_func = llama_nop;
5490
-
5491
- #ifdef GGML_USE_CUBLAS
5492
- if (il >= i_gpu_start) {
5493
- offload_func = ggml_cuda_assign_buffers_no_alloc;
5494
- }
5495
- #endif // GGML_USE_CUBLAS
5496
-
5497
- // self-attention
5498
- // TODO: refactor into common function (shared with LLaMA)
5499
- {
5500
- attn_norm = ggml_norm(ctx0, inpL, norm_eps);
5501
- offload_func(attn_norm);
5502
-
5503
- attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
5504
- offload_func(attn_norm);
5505
-
5506
- if (1) {
5507
- cur = attn_norm;
5508
- }
5509
-
5510
- // compute QKV
5511
-
5512
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
5513
- offload_func_kq(cur);
5514
-
5515
- if (clamp_kqv > 0.0f) {
5516
- cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
5517
- offload_func_kq(cur);
5518
- }
5519
-
5520
- const size_t wsize = ggml_type_size(cur->type);
5521
-
5522
- struct ggml_tensor * Qcur = ggml_view_3d(
5523
- ctx0, cur, n_embd_head, n_head, n_tokens,
5524
- wsize * n_embd_head,
5525
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
5526
- 0);
5527
- offload_func_kq(Qcur);
5528
-
5529
- struct ggml_tensor * Kcur = ggml_view_3d(
5530
- ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5531
- wsize * n_embd_head,
5532
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
5533
- wsize * n_embd_head * n_head);
5534
- offload_func_kq(Kcur);
5535
-
5536
- struct ggml_tensor * tmpv = ggml_view_3d(
5537
- ctx0, cur, n_embd_head, n_head_kv, n_tokens,
5538
- wsize * n_embd_head,
5539
- wsize * n_embd_head * (n_head + 2 * n_head_kv),
5540
- wsize * n_embd_head * (n_head + n_head_kv));
5541
- offload_func_kq(Kcur);
5542
-
5543
- ggml_set_name(Qcur, "Qcur");
5544
- ggml_set_name(Kcur, "Kcur");
5545
-
5546
- {
5547
- struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
5548
- offload_func_v(Vcur);
5549
- offload_func_v(Vcur->src[0]->src[0]);
5550
- ggml_set_name(Vcur, "Vcur");
5551
-
5552
- struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
5553
- offload_func_kq(k);
5554
- ggml_set_name(k, "k");
5555
-
5556
- struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
5557
- ( n_ctx)*ggml_element_size(kv_self.v),
5558
- (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
5559
- offload_func_v(v);
5560
-
5561
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
5562
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
5563
- }
5564
-
5565
- struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
5566
- offload_func_kq(Q);
5567
- ggml_set_name(Q, "Q");
5568
-
5569
- struct ggml_tensor * K =
5570
- ggml_view_3d(ctx0, kv_self.k,
5571
- n_embd_head, n_kv, n_head_kv,
5572
- ggml_element_size(kv_self.k)*n_embd_gqa,
5573
- ggml_element_size(kv_self.k)*n_embd_head,
5574
- ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
5575
- offload_func_kq(K);
5576
- ggml_set_name(K, "K");
5577
-
5578
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
5579
- offload_func_kq(KQ);
5580
- ggml_set_name(KQ, "KQ");
5581
-
5582
- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
5583
- offload_func_kq(KQ_scaled);
5584
- ggml_set_name(KQ_scaled, "KQ_scaled");
5585
-
5586
- // TODO: replace with ggml_add()
5587
- struct ggml_tensor * KQ_scaled_alibi =
5588
- ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
5589
- offload_func_kq(KQ_scaled_alibi);
5590
- ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
5591
-
5592
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
5593
- offload_func_kq(KQ_masked);
5594
- ggml_set_name(KQ_masked, "KQ_masked");
5595
-
5596
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
5597
- offload_func_v(KQ_soft_max);
5598
- ggml_set_name(KQ_soft_max, "KQ_soft_max");
5599
-
5600
- struct ggml_tensor * V =
5601
- ggml_view_3d(ctx0, kv_self.v,
5602
- n_kv, n_embd_head, n_head_kv,
5603
- ggml_element_size(kv_self.v)*n_ctx,
5604
- ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
5605
- ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
5606
- offload_func_v(V);
5607
- ggml_set_name(V, "V");
5608
-
5609
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
5610
- offload_func_v(KQV);
5611
- ggml_set_name(KQV, "KQV");
5612
-
5613
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
5614
- offload_func_v(KQV_merged);
5615
- ggml_set_name(KQV_merged, "KQV_merged");
5616
-
5617
- cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
5618
- offload_func_v(cur);
5619
- ggml_set_name(cur, "KQV_merged_contiguous");
5620
-
5621
- cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
5622
- offload_func(cur);
5623
- ggml_set_name(cur, "result_wo");
5624
- }
5625
-
5626
- // Add the input
5627
- cur = ggml_add(ctx0, cur, inpL);
5628
- offload_func(cur);
5629
-
5630
- struct ggml_tensor * attn_out = cur;
5631
-
5632
- // feed forward
5633
- {
5634
- // Norm
5635
- {
5636
- cur = ggml_norm(ctx0, attn_out, norm_eps);
5637
- offload_func(cur);
5638
-
5639
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
5640
- offload_func(cur);
5641
- }
5642
-
5643
- cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
5644
- offload_func(cur);
5645
-
5646
- cur = ggml_gelu(ctx0, cur);
5647
- offload_func(cur);
5648
- cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
5649
- offload_func(cur);
5650
- }
5651
-
5652
- cur = ggml_add(ctx0, cur, attn_out);
5653
- offload_func(cur);
5654
- // input for next layer
5655
- inpL = cur;
5656
- }
5657
-
5658
- cur = inpL;
5659
-
5660
- // norm
5661
- {
5662
- cur = ggml_norm(ctx0, cur, norm_eps);
5663
- offload_func_nr(cur);
5664
-
5665
- cur = ggml_mul(ctx0, cur, model.output_norm);
5666
- ggml_set_name(cur, "result_norm");
5667
- }
5668
-
5669
- cur = ggml_mul_mat(ctx0, model.output, cur);
5670
- ggml_set_name(cur, "result_output");
5671
-
5672
- ggml_build_forward_expand(gf, cur);
5673
-
5674
- ggml_free(ctx0);
5675
-
5676
- return gf;
5677
- }
5678
-
5679
- static struct ggml_cgraph * llama_build_graph(
5680
- llama_context & lctx,
5681
- const llama_batch & batch) {
5682
- const auto & model = lctx.model;
5683
-
5684
- struct ggml_cgraph * result = NULL;
5685
 
5686
  switch (model.arch) {
5687
  case LLM_ARCH_LLAMA:
@@ -5708,14 +4937,6 @@ static struct ggml_cgraph * llama_build_graph(
5708
  {
5709
  result = llm_build_refact(lctx, batch);
5710
  } break;
5711
- case LLM_ARCH_BLOOM:
5712
- {
5713
- result = llm_build_bloom(lctx, batch);
5714
- } break;
5715
- case LLM_ARCH_MPT:
5716
- {
5717
- result = llm_build_mpt(lctx, batch);
5718
- } break;
5719
  default:
5720
  GGML_ASSERT(false);
5721
  }
@@ -5846,8 +5067,7 @@ static int llama_decode_internal(
5846
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
5847
  model.arch == LLM_ARCH_BAICHUAN ||
5848
  model.arch == LLM_ARCH_FALCON ||
5849
- model.arch == LLM_ARCH_REFACT ||
5850
- model.arch == LLM_ARCH_MPT;
5851
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
5852
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
5853
  n_threads = 1;
@@ -6348,6 +5568,7 @@ private:
6348
  for (int i = 0; i < (int)text_utf.size(); i++) {
6349
  const std::string & utf_char = text_utf[i];
6350
  bool split_condition = false;
 
6351
  int bytes_remain = text_utf.size() - i;
6352
  // forward backward lookups
6353
  const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -6373,9 +5594,9 @@ private:
6373
  if (!split_condition && bytes_remain >= 3) {
6374
  // 're|'ve|'ll
6375
  if (utf_char == "\'" && (
6376
- (utf_char_next == "r" && utf_char_next_next == "e") ||
6377
- (utf_char_next == "v" && utf_char_next_next == "e") ||
6378
- (utf_char_next == "l" && utf_char_next_next == "l"))
6379
  ) {
6380
  split_condition = true;
6381
  }
@@ -6426,7 +5647,7 @@ private:
6426
  else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
6427
  split_condition = true;
6428
  }
6429
- else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
6430
  split_condition = true;
6431
  }
6432
  }
@@ -7945,7 +7166,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
7945
  const std::string name = ggml_get_name(meta);
7946
 
7947
  // TODO: avoid hardcoded tensor names - use the TN_* constants
7948
- if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
7949
  ++n_attention_wv;
7950
  }
7951
  else if (name.find("ffn_down.weight") != std::string::npos) {
 
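For reference, a minimal standalone sketch of the masking-plus-ALiBi pattern used by the graph builder removed above (the KQ_mask loop and the ggml_alibi call). The types, sizes, and cell metadata below are made-up stand-ins for kv_self.cells and batch.pos/seq_id, and the slope formula is the standard ALiBi one for a power-of-two head count, not necessarily bit-identical to ggml_alibi:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_head = 4, n_tokens = 3, n_kv = 5;
        const float max_alibi_bias = 8.0f;

        // hypothetical stand-ins for kv_self.cells and batch.pos/seq_id
        std::vector<int> cell_pos = {0, 1, 2, 3, 4};
        std::vector<int> cell_seq = {0, 0, 0, 1, 1};
        std::vector<int> tok_pos  = {2, 3, 4};
        std::vector<int> tok_seq  = {0, 0, 0};

        // KQ_mask: -inf for cells of another sequence or of a future position
        std::vector<float> mask(n_tokens * n_kv, 0.0f);
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                if (cell_seq[i] != tok_seq[j] || cell_pos[i] > tok_pos[j]) {
                    mask[j*n_kv + i] = -INFINITY;
                }
            }
        }

        // ALiBi: head h gets slope 2^(-max_bias*(h+1)/n_head); the bias grows linearly
        // with the key index (shift-invariant under softmax, so only differences matter)
        for (int h = 0; h < n_head; ++h) {
            const float slope = std::pow(2.0f, -max_alibi_bias * float(h + 1) / float(n_head));
            std::printf("head %d (slope %.4f), row 0 logit bias:", h, slope);
            for (int i = 0; i < n_kv; ++i) {
                std::printf(" %8.2f", mask[0*n_kv + i] + slope * float(i));
            }
            std::printf("\n");
        }
        return 0;
    }
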
189
  LLM_ARCH_STARCODER,
190
  LLM_ARCH_PERSIMMON,
191
  LLM_ARCH_REFACT,
 
192
  LLM_ARCH_UNKNOWN,
193
  };
194
 
 
202
  { LLM_ARCH_BAICHUAN, "baichuan" },
203
  { LLM_ARCH_STARCODER, "starcoder" },
204
  { LLM_ARCH_PERSIMMON, "persimmon" },
205
+ { LLM_ARCH_REFACT, "refact" },
 
206
  };
207
 
208
  enum llm_kv {
 
305
 
306
  enum llm_tensor {
307
  LLM_TENSOR_TOKEN_EMBD,
 
308
  LLM_TENSOR_POS_EMBD,
309
  LLM_TENSOR_OUTPUT,
310
  LLM_TENSOR_OUTPUT_NORM,
 
425
  LLM_ARCH_MPT,
426
  {
427
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
 
428
  },
429
  },
430
  {
 
459
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
460
  },
461
  },
 
462
  {
463
  LLM_ARCH_UNKNOWN,
464
  {
 
1016
  float rope_freq_base_train;
1017
  float rope_freq_scale_train;
1018
 
1019
  bool operator!=(const llama_hparams & other) const {
1020
  if (this->vocab_only != other.vocab_only) return true;
1021
  if (this->n_vocab != other.n_vocab) return true;
 
1201
 
1202
  struct ggml_tensor * tok_embeddings;
1203
  struct ggml_tensor * pos_embeddings;
 
 
1204
 
1205
  struct ggml_tensor * output_norm;
1206
  struct ggml_tensor * output_norm_b;
 
1330
  cache.cells.clear();
1331
  cache.cells.resize(n_ctx);
1332
 
1333
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
 
1334
 
1335
  struct ggml_init_params params;
1336
  params.mem_size = cache.buf.size;
 
1736
  }
1737
  }
1738
 
1739
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
1740
  if (backend != GGML_BACKEND_CPU) {
1741
  ggml_set_no_alloc(ctx, true);
1742
  }
 
1754
  return tensor;
1755
  }
1756
 
1757
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
1758
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1759
 
1760
  if (cur == NULL) {
 
2047
  }
2048
  } break;
2049
  case LLM_ARCH_PERSIMMON:
2050
+ {
2051
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2052
+ switch (hparams.n_layer) {
2053
+ case 36: model.type = e_model::MODEL_8B; break;
2054
+ default: model.type = e_model::MODEL_UNKNOWN;
2055
+ }
2056
+ } break;
2057
  case LLM_ARCH_REFACT:
2058
  {
2059
  GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
 
2062
  default: model.type = e_model::MODEL_UNKNOWN;
2063
  }
2064
  } break;
 
2065
  default: (void)0;
2066
  }
2067
 
 
2206
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2207
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2208
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
 
 
2209
  LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2210
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2211
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
 
2305
 
2306
  // output
2307
  {
2308
+ ggml_backend backend_norm;
2309
+ ggml_backend backend_output;
2310
 
2311
  if (n_gpu_layers > int(n_layer)) {
2312
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2341
  model.layers.resize(n_layer);
2342
 
2343
  for (uint32_t i = 0; i < n_layer; ++i) {
2344
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2345
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2346
 
2347
  auto & layer = model.layers[i];
2348
 
 
2371
  {
2372
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2373
  {
2374
+ ggml_backend backend_norm;
2375
+ ggml_backend backend_output;
2376
 
2377
  if (n_gpu_layers > int(n_layer)) {
2378
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2407
  model.layers.resize(n_layer);
2408
 
2409
  for (uint32_t i = 0; i < n_layer; ++i) {
2410
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2411
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2412
 
2413
  auto & layer = model.layers[i];
2414
 
 
2441
 
2442
  // output
2443
  {
2444
+ ggml_backend backend_norm;
2445
+ ggml_backend backend_output;
2446
 
2447
  if (n_gpu_layers > int(n_layer)) {
2448
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2479
  model.layers.resize(n_layer);
2480
 
2481
  for (uint32_t i = 0; i < n_layer; ++i) {
2482
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2483
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2484
 
2485
  auto & layer = model.layers[i];
2486
 
 
2518
 
2519
  // output
2520
  {
2521
+ ggml_backend backend_norm;
2522
+ ggml_backend backend_output;
2523
 
2524
  if (n_gpu_layers > int(n_layer)) {
2525
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2556
  model.layers.resize(n_layer);
2557
 
2558
  for (uint32_t i = 0; i < n_layer; ++i) {
2559
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2560
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2561
 
2562
  auto & layer = model.layers[i];
2563
 
 
2595
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2596
 
2597
  {
2598
+ ggml_backend backend_norm;
2599
+ ggml_backend backend_output;
2600
 
2601
  if (n_gpu_layers > int(n_layer)) {
2602
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
 
2630
  const int i_gpu_start = n_layer - n_gpu_layers;
2631
  model.layers.resize(n_layer);
2632
  for (uint32_t i = 0; i < n_layer; ++i) {
2633
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2634
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2635
  auto & layer = model.layers[i];
2636
  layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2637
  layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
 
2651
  layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
2652
  }
2653
  } break;
 
2654
  default:
2655
  throw std::runtime_error("unknown architecture");
2656
  }
 
4507
  return gf;
4508
  }
4509
 
4510
+
4511
  static struct ggml_cgraph * llm_build_persimmon(
4512
  llama_context & lctx,
4513
  const llama_batch & batch) {
 
4905
  return gf;
4906
  }
4907
 
4908
+ static struct ggml_cgraph * llama_build_graph(
4909
  llama_context & lctx,
4910
  const llama_batch & batch) {
4911
+ const auto & model = lctx.model;
 
 
4912
 
4913
+ struct ggml_cgraph * result = NULL;
 
4914
 
4915
  switch (model.arch) {
4916
  case LLM_ARCH_LLAMA:
 
4937
  {
4938
  result = llm_build_refact(lctx, batch);
4939
  } break;
 
4940
  default:
4941
  GGML_ASSERT(false);
4942
  }
 
5067
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
5068
  model.arch == LLM_ARCH_BAICHUAN ||
5069
  model.arch == LLM_ARCH_FALCON ||
5070
+ model.arch == LLM_ARCH_REFACT;
 
5071
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
5072
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
5073
  n_threads = 1;
 
5568
  for (int i = 0; i < (int)text_utf.size(); i++) {
5569
  const std::string & utf_char = text_utf[i];
5570
  bool split_condition = false;
5571
+ // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
5572
  int bytes_remain = text_utf.size() - i;
5573
  // forward backward lookups
5574
  const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
 
5594
  if (!split_condition && bytes_remain >= 3) {
5595
  // 're|'ve|'ll
5596
  if (utf_char == "\'" && (
5597
+ (utf_char_next == "r" || utf_char_next_next == "e") ||
5598
+ (utf_char_next == "v" || utf_char_next_next == "e") ||
5599
+ (utf_char_next == "l" || utf_char_next_next == "l"))
5600
  ) {
5601
  split_condition = true;
5602
  }
 
5647
  else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
5648
  split_condition = true;
5649
  }
5650
+ else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
5651
  split_condition = true;
5652
  }
5653
  }
 
7166
  const std::string name = ggml_get_name(meta);
7167
 
7168
  // TODO: avoid hardcoded tensor names - use the TN_* constants
7169
+ if (name.find("attn_v.weight") != std::string::npos) {
7170
  ++n_attention_wv;
7171
  }
7172
  else if (name.find("ffn_down.weight") != std::string::npos) {
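
For reference, a minimal sketch of the apostrophe lookahead that the split conditions above implement (the GPT-2-style 's|'t|'re|'ve|'m|'ll|'d contractions). The helper and the single-character cases are illustrative assumptions; the real tokenizer also classifies letters, digits and whitespace, and the exact boolean form of the two-character check differs between this tree and upstream:

    #include <iostream>
    #include <string>
    #include <vector>

    // returns true when the apostrophe at position i starts a contraction token
    static bool is_contraction_start(const std::vector<std::string> & chars, size_t i) {
        if (chars[i] != "'") return false;
        const std::string next  = i + 1 < chars.size() ? chars[i + 1] : "";
        const std::string next2 = i + 2 < chars.size() ? chars[i + 2] : "";
        if (next == "s" || next == "t" || next == "m" || next == "d") return true;  // 's 't 'm 'd
        if ((next == "r" && next2 == "e") ||                                        // 're
            (next == "v" && next2 == "e") ||                                        // 've
            (next == "l" && next2 == "l")) return true;                             // 'll
        return false;
    }

    int main() {
        const std::vector<std::string> text = {"w", "e", "'", "r", "e", " ", "o", "k"};
        for (size_t i = 0; i < text.size(); ++i) {
            std::cout << "'" << text[i] << "'"
                      << (is_contraction_start(text, i) ? "  <- split before this apostrophe" : "")
                      << "\n";
        }
        return 0;
    }
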
otherarch/llama_v3.cpp CHANGED
@@ -63,8 +63,9 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
63
  #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__)
64
  #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
65
 
66
- #include "ggml-alloc.h"
67
  #if !defined(GGML_USE_CUBLAS)
 
68
  #define LLAMA_V3_USE_ALLOCATOR
69
  #else
70
  #define LLAMA_V3_USE_SCRATCH
@@ -724,7 +725,7 @@ struct llama_v3_model_loader {
724
  }
725
  }
726
 
727
- struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend_type backend) {
728
  auto it = tensors_map.name_to_idx.find(name);
729
  if (it == tensors_map.name_to_idx.end()) {
730
  throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));
@@ -738,7 +739,7 @@ struct llama_v3_model_loader {
738
  return get_tensor_for(lt, backend);
739
  }
740
 
741
- struct ggml_tensor * get_tensor_for(llama_v3_load_tensor & lt, ggml_backend_type backend) {
742
  struct ggml_tensor * tensor;
743
  if (backend != GGML_BACKEND_CPU) {
744
  ggml_set_no_alloc(ggml_ctx, true);
@@ -1229,8 +1230,8 @@ static void llama_v3_model_load_internal(
1229
 
1230
  // "output" tensor
1231
  {
1232
- ggml_backend_type backend_norm;
1233
- ggml_backend_type backend_output;
1234
  if (n_gpu_layers > int(n_layer)) { // NOLINT
1235
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
1236
  // on Windows however this is detrimental unless everything is on the GPU
@@ -1260,8 +1261,8 @@ static void llama_v3_model_load_internal(
1260
 
1261
  model.layers.resize(n_layer);
1262
  for (uint32_t i = 0; i < n_layer; ++i) {
1263
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; // NOLINT
1264
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD_SPLIT; // NOLINT
1265
 
1266
  auto & layer = model.layers[i];
1267
 
 
63
  #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__)
64
  #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
65
 
66
+
67
  #if !defined(GGML_USE_CUBLAS)
68
+ #include "ggml-alloc.h"
69
  #define LLAMA_V3_USE_ALLOCATOR
70
  #else
71
  #define LLAMA_V3_USE_SCRATCH
 
725
  }
726
  }
727
 
728
+ struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
729
  auto it = tensors_map.name_to_idx.find(name);
730
  if (it == tensors_map.name_to_idx.end()) {
731
  throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));
 
739
  return get_tensor_for(lt, backend);
740
  }
741
 
742
+ struct ggml_tensor * get_tensor_for(llama_v3_load_tensor & lt, ggml_backend backend) {
743
  struct ggml_tensor * tensor;
744
  if (backend != GGML_BACKEND_CPU) {
745
  ggml_set_no_alloc(ggml_ctx, true);
 
1230
 
1231
  // "output" tensor
1232
  {
1233
+ ggml_backend backend_norm;
1234
+ ggml_backend backend_output;
1235
  if (n_gpu_layers > int(n_layer)) { // NOLINT
1236
  // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
1237
  // on Windows however this is detrimental unless everything is on the GPU
 
1261
 
1262
  model.layers.resize(n_layer);
1263
  for (uint32_t i = 0; i < n_layer; ++i) {
1264
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; // NOLINT
1265
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD_SPLIT; // NOLINT
1266
 
1267
  auto & layer = model.layers[i];
1268
 
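For reference, a minimal sketch of the per-layer backend assignment pattern that recurs in the loader changes above (i_gpu_start = n_layer - n_gpu_layers; earlier layers stay on the CPU, the rest are offloaded). The enum and values below are stand-ins, not the real GGML_BACKEND_* / LLAMA_V3_BACKEND_* constants:

    #include <cstdio>

    enum backend_kind { BACKEND_CPU, BACKEND_OFFLOAD };   // stand-ins for GGML_BACKEND_CPU / LLAMA_V3_BACKEND_OFFLOAD

    int main() {
        const int n_layer      = 8;
        const int n_gpu_layers = 3;                        // e.g. from --n-gpu-layers
        const int i_gpu_start  = n_layer - n_gpu_layers;   // first layer that goes to the GPU

        for (int i = 0; i < n_layer; ++i) {
            const backend_kind backend = i < i_gpu_start ? BACKEND_CPU : BACKEND_OFFLOAD;
            std::printf("layer %d -> %s\n", i, backend == BACKEND_CPU ? "CPU" : "GPU");
        }
        return 0;
    }
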
spm-headers/ggml.h CHANGED
@@ -326,7 +326,7 @@ extern "C" {
326
  GGML_TYPE_COUNT,
327
  };
328
 
329
- enum ggml_backend_type {
330
  GGML_BACKEND_CPU = 0,
331
  GGML_BACKEND_GPU = 10,
332
  GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,10 +479,8 @@ extern "C" {
479
 
480
  // n-dimensional tensor
481
  struct ggml_tensor {
482
- enum ggml_type type;
483
- enum ggml_backend_type backend;
484
-
485
- struct ggml_backend_buffer * buffer;
486
 
487
  int n_dims;
488
  int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -516,7 +514,7 @@ extern "C" {
516
 
517
  void * extra; // extra things e.g. for ggml-cuda.cu
518
 
519
- char padding[12];
520
  };
521
 
522
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -1360,7 +1358,7 @@ extern "C" {
1360
 
1361
  // alibi position embedding
1362
  // in-place, returns view(a)
1363
- GGML_API struct ggml_tensor * ggml_alibi(
1364
  struct ggml_context * ctx,
1365
  struct ggml_tensor * a,
1366
  int n_past,
@@ -1369,7 +1367,7 @@ extern "C" {
1369
 
1370
  // clamp
1371
  // in-place, returns view(a)
1372
- GGML_API struct ggml_tensor * ggml_clamp(
1373
  struct ggml_context * ctx,
1374
  struct ggml_tensor * a,
1375
  float min,
@@ -2104,7 +2102,7 @@ extern "C" {
2104
  enum ggml_type vec_dot_type;
2105
  } ggml_type_traits_t;
2106
 
2107
- GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2108
 
2109
  #ifdef __cplusplus
2110
  }
 
326
  GGML_TYPE_COUNT,
327
  };
328
 
329
+ enum ggml_backend {
330
  GGML_BACKEND_CPU = 0,
331
  GGML_BACKEND_GPU = 10,
332
  GGML_BACKEND_GPU_SPLIT = 20,
 
479
 
480
  // n-dimensional tensor
481
  struct ggml_tensor {
482
+ enum ggml_type type;
483
+ enum ggml_backend backend;
 
 
484
 
485
  int n_dims;
486
  int64_t ne[GGML_MAX_DIMS]; // number of elements
 
514
 
515
  void * extra; // extra things e.g. for ggml-cuda.cu
516
 
517
+ char padding[4];
518
  };
519
 
520
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
1358
 
1359
  // alibi position embedding
1360
  // in-place, returns view(a)
1361
+ struct ggml_tensor * ggml_alibi(
1362
  struct ggml_context * ctx,
1363
  struct ggml_tensor * a,
1364
  int n_past,
 
1367
 
1368
  // clamp
1369
  // in-place, returns view(a)
1370
+ struct ggml_tensor * ggml_clamp(
1371
  struct ggml_context * ctx,
1372
  struct ggml_tensor * a,
1373
  float min,
 
2102
  enum ggml_type vec_dot_type;
2103
  } ggml_type_traits_t;
2104
 
2105
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2106
 
2107
  #ifdef __cplusplus
2108
  }
tests/test-tokenizer-0-falcon.cpp CHANGED
@@ -36,8 +36,6 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
36
  { " Hello" , { 258, 23090, }, },
37
  { " Hello" , { 466, 23090, }, },
38
  { " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
39
- { "\n =" , { 1212, 40, }, },
40
- { "' era" , { 18, 4932, }, },
41
  };
42
 
43
  return _k_tests;
@@ -157,7 +155,7 @@ int main(int argc, char **argv) {
157
 
158
  fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
159
 
160
- const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
161
 
162
  fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
163
 
@@ -171,8 +169,10 @@ int main(int argc, char **argv) {
171
  }
172
 
173
  for (const auto & tok : res) {
174
- ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
175
  }
 
 
176
  }
177
 
178
  fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
 
36
  { " Hello" , { 258, 23090, }, },
37
  { " Hello" , { 466, 23090, }, },
38
  { " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
 
 
39
  };
40
 
41
  return _k_tests;
 
155
 
156
  fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
157
 
158
+ const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
159
 
160
  fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
161
 
 
169
  }
170
 
171
  for (const auto & tok : res) {
172
+ ofs << tok << " ";
173
  }
174
+
175
+ ofs << "\n";
176
  }
177
 
178
  fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
tests/test-tokenizer-0-falcon.py CHANGED
@@ -41,8 +41,6 @@ tests = [
41
  " Hello",
42
  " Hello",
43
  " Hello\n Hello",
44
- "\n =",
45
- "' era",
46
  ]
47
 
48
  for text in tests:
@@ -71,14 +69,15 @@ fname_tok = args.fname_tok
71
  if fname_tok:
72
  print('tokenizing file: ', fname_tok)
73
  fname_out = fname_tok + '.tok'
74
- with open(fname_tok, 'r', encoding='utf-8') as f:
75
  lines = f.readlines()
76
  s = ''.join(lines)
77
  res = tokenizer.encode(s)
78
  # write to file
79
- with open(fname_out, 'w', encoding='utf-8') as f:
80
  for x in res:
81
- f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
 
82
  print('len(res): ', len(res))
83
  print('len(lines): ', len(lines))
84
  print('results written to: ', fname_out)
 
41
  " Hello",
42
  " Hello",
43
  " Hello\n Hello",
 
 
44
  ]
45
 
46
  for text in tests:
 
69
  if fname_tok:
70
  print('tokenizing file: ', fname_tok)
71
  fname_out = fname_tok + '.tok'
72
+ with open(fname_tok, 'r') as f:
73
  lines = f.readlines()
74
  s = ''.join(lines)
75
  res = tokenizer.encode(s)
76
  # write to file
77
+ with open(fname_out, 'w') as f:
78
  for x in res:
79
+ f.write(str(x) + ' ')
80
+ f.write('\n')
81
  print('len(res): ', len(res))
82
  print('len(lines): ', len(lines))
83
  print('results written to: ', fname_out)
tests/test-tokenizer-0-llama.cpp CHANGED
@@ -174,8 +174,10 @@ int main(int argc, char **argv) {
174
  }
175
 
176
  for (const auto & tok : res) {
177
- ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
178
  }
 
 
179
  }
180
 
181
  fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
 
174
  }
175
 
176
  for (const auto & tok : res) {
177
+ ofs << tok << " ";
178
  }
179
+
180
+ ofs << "\n";
181
  }
182
 
183
  fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
tests/test-tokenizer-0-llama.py CHANGED
@@ -81,14 +81,15 @@ fname_tok = args.fname_tok
81
  if fname_tok:
82
  print('tokenizing file: ', fname_tok)
83
  fname_out = fname_tok + '.tok'
84
- with open(fname_tok, 'r', encoding='utf-8') as f:
85
  lines = f.readlines()
86
  s = ''.join(lines)
87
  res = tokenizer.encode(s, add_bos=True)
88
  # write to file
89
- with open(fname_out, 'w', encoding='utf-8') as f:
90
  for x in res:
91
- f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
 
92
  print('len(res): ', len(res))
93
  print('len(lines): ', len(lines))
94
  print('results written to: ', fname_out)
 
81
  if fname_tok:
82
  print('tokenizing file: ', fname_tok)
83
  fname_out = fname_tok + '.tok'
84
+ with open(fname_tok, 'r') as f:
85
  lines = f.readlines()
86
  s = ''.join(lines)
87
  res = tokenizer.encode(s, add_bos=True)
88
  # write to file
89
+ with open(fname_out, 'w') as f:
90
  for x in res:
91
+ f.write(str(x) + ' ')
92
+ f.write('\n')
93
  print('len(res): ', len(res))
94
  print('len(lines): ', len(lines))
95
  print('results written to: ', fname_out)
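
For reference, a minimal sketch of how the simplified .tokcpp / .tok outputs written by the tests above (plain space-separated token ids) can be compared token by token. The input file names are hypothetical; only the .tokcpp / .tok suffixes come from the tests themselves:

    #include <fstream>
    #include <iostream>

    int main() {
        std::ifstream a("wiki.test.raw.tokcpp");  // written by test-tokenizer-0-*.cpp (hypothetical base name)
        std::ifstream b("wiki.test.raw.tok");     // written by test-tokenizer-0-*.py  (hypothetical base name)
        long ia = 0, ib = 0;
        size_t idx = 0;
        while (true) {
            const bool ha = static_cast<bool>(a >> ia);
            const bool hb = static_cast<bool>(b >> ib);
            if (!ha || !hb) {
                if (ha != hb) std::cout << "length mismatch at token " << idx << "\n";
                else          std::cout << "outputs match (" << idx << " tokens)\n";
                break;
            }
            if (ia != ib) {
                std::cout << "mismatch at token " << idx << ": " << ia << " vs " << ib << "\n";
                break;
            }
            ++idx;
        }
        return 0;
    }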