Commit dc53b3a by Illumotion (1 parent: 9938c27)

Upload folder using huggingface_hub

.devops/full-cuda.Dockerfile ADDED
@@ -0,0 +1,33 @@
+ ARG UBUNTU_VERSION=22.04
+
+ # This needs to generally match the container host's environment.
+ ARG CUDA_VERSION=11.7.1
+
+ # Target the CUDA build image
+ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+ FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+ # Unless otherwise specified, we make a fat build.
+ ARG CUDA_DOCKER_ARCH=all
+
+ RUN apt-get update && \
+     apt-get install -y build-essential python3 python3-pip
+
+ COPY requirements.txt requirements.txt
+
+ RUN pip install --upgrade pip setuptools wheel \
+     && pip install -r requirements.txt
+
+ WORKDIR /app
+
+ COPY . .
+
+ # Set nvcc architecture
+ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+ # Enable cuBLAS
+ ENV LLAMA_CUBLAS=1
+
+ RUN make
+
+ ENTRYPOINT ["/app/.devops/tools.sh"]
.devops/main-cuda.Dockerfile ADDED
@@ -0,0 +1,32 @@
+ ARG UBUNTU_VERSION=22.04
+ # This needs to generally match the container host's environment.
+ ARG CUDA_VERSION=11.7.1
+ # Target the CUDA build image
+ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ # Target the CUDA runtime image
+ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+ FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+ # Unless otherwise specified, we make a fat build.
+ ARG CUDA_DOCKER_ARCH=all
+
+ RUN apt-get update && \
+     apt-get install -y build-essential
+
+ WORKDIR /app
+
+ COPY . .
+
+ # Set nvcc architecture
+ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+ # Enable cuBLAS
+ ENV LLAMA_CUBLAS=1
+
+ RUN make
+
+ FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+
+ COPY --from=build /app/main /main
+
+ ENTRYPOINT [ "/main" ]
.gitignore CHANGED
@@ -20,6 +20,7 @@ build-static/
  build-cublas/
  build-opencl/
  build-metal/
+ build-mpi/
  build-no-accel/
  build-sanitize-addr/
  build-sanitize-thread/
@@ -67,4 +68,6 @@ koboldcpp_failsafe.dll
  koboldcpp_openblas.dll
  koboldcpp_openblas_noavx2.dll
  koboldcpp_clblast.dll
- koboldcpp_cublas.dll
+ koboldcpp_cublas.dll
+ cublas64_11.dll
+ cublasLt64_11.dll
CMakeLists.txt CHANGED
@@ -28,6 +28,8 @@ set(LLAMA_SANITIZE_THREAD OFF)
  set(LLAMA_SANITIZE_ADDRESS OFF)
  set(LLAMA_SANITIZE_UNDEFINED OFF)

+ option(MAKE_MISC_FILES "MAKE_MISC_FILES" OFF)
+
  # instruction set specific
  option(LLAMA_AVX "llama: enable AVX" ON)
  option(LLAMA_AVX2 "llama: enable AVX2" ON)
@@ -44,6 +46,7 @@ endif()
  option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
  set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
  set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+ set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
  option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
  set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
  option(LLAMA_K_QUANTS "llama: use k-quants" ON)
@@ -71,13 +74,16 @@ if (LLAMA_CUBLAS)

  enable_language(CUDA)

- set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+ set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
  set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
  set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)

  add_compile_definitions(GGML_USE_CUBLAS)
+ #add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
+
  add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
  add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+ add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
  if (LLAMA_CUDA_DMMV_F16)
  add_compile_definitions(GGML_CUDA_DMMV_F16)
  endif()
@@ -89,6 +95,15 @@ if (LLAMA_CUBLAS)
  set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
  endif()

+ if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+     if (LLAMA_CUDA_DMMV_F16)
+         set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
+     else()
+         set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
+     endif()
+ endif()
+ message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
  else()
  message(WARNING "cuBLAS not found")
  endif()
@@ -246,7 +261,7 @@ add_library(ggml OBJECT
  ggml.h
  k_quants.h
  k_quants.c
- ${GGML_CUDA_SOURCES})
+ ${GGML_SOURCES_CUDA})
  target_include_directories(ggml PUBLIC . ./otherarch ./otherarch/tools)
  target_compile_features(ggml PUBLIC c_std_11) # don't bump
  target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
@@ -286,12 +301,6 @@ target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
  set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)


- if (GGML_CUDA_SOURCES)
-     message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-     set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
-     set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
- endif()
-
  set(TARGET koboldcpp_cublas)
  add_library(${TARGET} SHARED expose.cpp expose.h)
  target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples)
@@ -301,3 +310,19 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
  set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
  target_link_libraries(${TARGET} PUBLIC ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${CMAKE_THREAD_LIBS_INIT})
  target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+
+ if (MAKE_MISC_FILES)
+ add_library(llama
+     llama.cpp
+     llama.h
+     llama-util.h
+     )
+ target_include_directories(llama PUBLIC .)
+ target_compile_features(llama PUBLIC cxx_std_11) # don't bump
+ target_link_libraries(llama PRIVATE
+     ggml
+     ${LLAMA_EXTRA_LIBS}
+     )
+ add_subdirectory(examples)
+ endif()
Makefile CHANGED
@@ -144,17 +144,27 @@ ifdef LLAMA_CUBLAS
  CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
  CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
  NVCC = nvcc
- NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+ NVCCFLAGS = --forward-unknown-to-host-compiler
+ ifdef CUDA_DOCKER_ARCH
+ NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+ else
+ NVCCFLAGS += -arch=native
+ endif # CUDA_DOCKER_ARCH
+ ifdef LLAMA_CUDA_FORCE_DMMV
+ NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
+ endif # LLAMA_CUDA_FORCE_DMMV
  ifdef LLAMA_CUDA_DMMV_X
  NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
  else
  NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
  endif # LLAMA_CUDA_DMMV_X
- ifdef LLAMA_CUDA_DMMV_Y
- NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
+ ifdef LLAMA_CUDA_MMV_Y
+ NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+ else ifdef LLAMA_CUDA_DMMV_Y
+ NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
  else
- NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
- endif # LLAMA_CUDA_DMMV_Y
+ NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
+ endif # LLAMA_CUDA_MMV_Y
  ifdef LLAMA_CUDA_DMMV_F16
  NVCCFLAGS += -DGGML_CUDA_DMMV_F16
  endif # LLAMA_CUDA_DMMV_F16
convert.py CHANGED
@@ -828,6 +828,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:


  SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
+     'BF16': DT_BF16,
      'F16': DT_F16,
      'F32': DT_F32,
      'I32': DT_I32,
examples/baby-llama/baby-llama.cpp CHANGED
@@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) {
  return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
  }

+ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+     if (plan.work_size > 0) {
+         buf.resize(plan.work_size);
+         plan.work_data = buf.data();
+     }
+
+     ggml_graph_compute(graph, &plan);
+ }
+
  struct ggml_tensor * randomize_tensor(
  struct ggml_tensor * tensor,
  int ndims,
@@ -1569,6 +1580,8 @@ int main(int argc, char ** argv) {
  int n_tokens = model.hparams.n_ctx;
  int n_vocab = model.hparams.n_vocab;

+ std::vector<uint8_t> work_buffer;
+
  for (int ex=0; ex<n_examples; ++ex) {
  struct ggml_init_params params = {
  /*.mem_size =*/ compute_size,
@@ -1586,7 +1599,6 @@
  int n_past = 0;

  ggml_cgraph gf = {};
- gf.n_threads = 1;

  get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

@@ -1595,7 +1607,7 @@
  struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

  ggml_build_forward_expand(&gf, e);
- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

  float error_before_opt = ggml_get_f32_1d(e, 0);

@@ -1611,7 +1623,7 @@
  ggml_opt(ctx0, opt_params_lbfgs, e);
  //
  ggml_build_forward_expand(&gf, e);
- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

  float error_after_opt = ggml_get_f32_1d(e, 0);

@@ -1659,13 +1671,12 @@
  struct ggml_context * ctx0 = ggml_init(params);

  ggml_cgraph gf = {};
- gf.n_threads = 1;

  int n_past = 0;
  struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);

  ggml_build_forward_expand(&gf, logits);
- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

  struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
  struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -1687,10 +1698,11 @@
  }

  print_matrix(model.tok_embeddings);
-
  printf("done\n");
+
  // ggml_free(kv_self.ctx);
  // ggml_free(model_lora.ctx);
  ggml_free(model.ctx);
+
  return 0;
  }
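The helper added above (and repeated in benchmark-matmult.cpp and train-text-from-scratch.cpp below) exists because ggml_graph_compute() no longer reads a thread count or work buffer from the graph itself. A minimal sketch of the migration, assuming the ggml.h shipped with this commit; the before/after comparison is illustrative, not part of the diff:

    #include <cstdint>
    #include <vector>
    #include "ggml.h"

    // Old pattern (removed by this commit):
    //     gf.n_threads = n_threads;
    //     ggml_graph_compute(ctx0, &gf);
    //
    // New pattern: ask ggml for a plan, hand the plan caller-owned scratch memory,
    // then compute with the plan.
    static void compute_graph(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
        if (plan.work_size > 0) {          // scratch is only needed when the plan asks for it
            buf.resize(plan.work_size);
            plan.work_data = buf.data();
        }
        ggml_graph_compute(graph, &plan);
    }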
examples/benchmark/benchmark-matmult.cpp CHANGED
@@ -20,6 +20,17 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

+ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+     if (plan.work_size > 0) {
+         buf.resize(plan.work_size);
+         plan.work_data = buf.data();
+     }
+
+     ggml_graph_compute(graph, &plan);
+ }
+
  float tensor_sum_elements(const ggml_tensor * tensor) {
  float sum = 0;
  if (tensor->type==GGML_TYPE_F32) {
@@ -159,13 +170,14 @@ int main(int argc, char ** argv) {
  // printf("Creating compute graph\n");
  struct ggml_cgraph gf = ggml_build_forward(m11xm2);

- gf.n_threads=benchmark_params.n_threads;
- printf("cgraph->n_threads=%i\n",gf.n_threads);
+ printf("n_threads=%i\n", benchmark_params.n_threads);

  TENSOR_DUMP(m11);
  TENSOR_DUMP(m2);

- ggml_graph_compute(ctx, &gf);
+ std::vector<uint8_t> work_buffer;
+
+ ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);

  TENSOR_DUMP(gf.nodes[0]);

@@ -187,7 +199,6 @@ int main(int argc, char ** argv) {

  // printf("Creating compute graph\n");
  struct ggml_cgraph gf31 = ggml_build_forward(q31);
- gf31.n_threads=benchmark_params.n_threads;

  // Set up a second graph computation to make sure we override the CPU cache lines
  // printf("Creating new tensor q12 & Running quantize\n");
@@ -199,8 +210,7 @@ int main(int argc, char ** argv) {

  //printf("Creating compute graph\n");
  struct ggml_cgraph gf32 = ggml_build_forward(q32);
- gf32.n_threads=benchmark_params.n_threads;
- printf("cgraph->n_threads=%i\n",gf31.n_threads);
+ printf("n_threads=%i\n", benchmark_params.n_threads);

  const int dimx = sizex;
  const int dimy = sizey;
@@ -221,14 +231,15 @@ int main(int argc, char ** argv) {

  long long int start = ggml_time_us();
  //printf("Running ggml_graph_compute\n");
- ggml_graph_compute(ctx, &gf31);
+ ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
+
  long long int stop = ggml_time_us();
  long long int usec = stop-start;
  double gflops = (double)(flops_per_matrix)/usec/1000.0;
  gflops_sum += gflops;
  printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
  i,
- gf31.n_threads,
+ benchmark_params.n_threads,
  sizex, sizey, sizez, flops_per_matrix,
  usec,gflops);

@@ -253,7 +264,7 @@ int main(int argc, char ** argv) {
  }

  // Running a different graph computation to make sure we override the CPU cache lines
- ggml_graph_compute(ctx, &gf32);
+ ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
  }
  printf("\n");
  printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
examples/common.cpp CHANGED
@@ -236,6 +236,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  break;
  }
  params.mirostat_tau = std::stof(argv[i]);
+ } else if (arg == "--cfg-negative-prompt") {
+     if (++i >= argc) {
+         invalid_param = true;
+         break;
+     }
+     params.cfg_negative_prompt = argv[i];
+ } else if (arg == "--cfg-scale") {
+     if (++i >= argc) {
+         invalid_param = true;
+         break;
+     }
+     params.cfg_scale = std::stof(argv[i]);
+ } else if (arg == "--cfg-smooth-factor") {
+     if (++i >= argc) {
+         invalid_param = true;
+         break;
+     }
+     params.cfg_smooth_factor = std::stof(argv[i]);
  } else if (arg == "-b" || arg == "--batch-size") {
  if (++i >= argc) {
  invalid_param = true;
@@ -267,7 +285,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  break;
  }
  params.lora_adapter = argv[i];
- params.use_mmap = false;
  } else if (arg == "--lora-base") {
  if (++i >= argc) {
  invalid_param = true;
@@ -418,6 +435,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {

  if (escape_prompt) {
  process_escapes(params.prompt);
+ process_escapes(params.input_prefix);
+ process_escapes(params.input_suffix);
  }

  return true;
@@ -468,6 +487,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n");
  fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
  fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+ fprintf(stderr, " --cfg-negative-prompt PROMPT \n");
+ fprintf(stderr, " negative prompt to use for guidance. (default: empty)\n");
+ fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+ fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
  fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
  fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
  fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
@@ -497,7 +520,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
  fprintf(stderr, " --mtest compute maximum memory usage\n");
  fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
  fprintf(stderr, " --verbose-prompt print prompt before generation\n");
- fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+ fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
  fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
  fprintf(stderr, " -m FNAME, --model FNAME\n");
  fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
@@ -534,7 +557,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
  return res;
  }

- std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
  auto lparams = llama_context_default_params();

  lparams.n_ctx = params.n_ctx;
@@ -550,6 +573,12 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
  lparams.logits_all = params.perplexity;
  lparams.embedding = params.embedding;

+ return lparams;
+ }
+
+ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+ auto lparams = llama_context_params_from_gpt_params(params);
+
  llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
  if (model == NULL) {
  fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
examples/common.h CHANGED
@@ -48,6 +48,12 @@ struct gpt_params {
  float mirostat_tau = 5.00f; // target entropy
  float mirostat_eta = 0.10f; // learning rate

+ // Classifier-Free Guidance
+ // https://arxiv.org/abs/2306.17806
+ std::string cfg_negative_prompt; // string to help guidance
+ float cfg_scale = 1.f; // How strong is guidance
+ float cfg_smooth_factor = 1.f; // Smooth factor between old and new logits
+
  std::string model = "models/7B/ggml-model.bin"; // model path
  std::string model_alias = "unknown"; // model alias
  std::string prompt = "";
@@ -99,6 +105,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
  //

  std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
+ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

  //
  // Console utils
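These new gpt_params fields are consumed by the examples/main changes later in this commit: when cfg_scale is above 1.0, a second llama_context is created for the negative prompt and guidance is applied to the candidate logits before the usual penalties. A condensed sketch of that wiring follows; the wrapper names maybe_init_guidance_ctx and apply_cfg are illustrative, while the llama/common calls are the ones this diff introduces:

    #include <vector>
    #include "common.h"
    #include "llama.h"

    // Create the guidance context only when CFG is enabled (cfg_scale == 1.0 disables it).
    static llama_context * maybe_init_guidance_ctx(llama_model * model, const gpt_params & params) {
        if (params.cfg_scale <= 1.f) {
            return NULL;
        }
        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
        return llama_new_context_with_model(model, lparams);
    }

    // After both contexts have been evaluated, apply guidance to the candidates
    // before repetition penalties and sampling.
    static void apply_cfg(llama_context * ctx, llama_context * ctx_guidance,
                          std::vector<llama_token_data> & candidates, const gpt_params & params) {
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        if (ctx_guidance) {
            llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance,
                                                  params.cfg_scale, params.cfg_smooth_factor);
        }
    }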
examples/embd-input/embd-input-lib.cpp CHANGED
@@ -34,7 +34,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
  }
  fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

- llama_init_backend(params.numa);
+ llama_backend_init(params.numa);

  llama_model * model;
  llama_context * ctx;
examples/embedding/embedding.cpp CHANGED
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
  params.prompt = gpt_random_prompt(rng);
  }

- llama_init_backend(params.numa);
+ llama_backend_init(params.numa);

  llama_model * model;
  llama_context * ctx;
@@ -93,5 +93,7 @@ int main(int argc, char ** argv) {
  llama_free(ctx);
  llama_free_model(model);

+ llama_backend_free();
+
  return 0;
  }
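The same rename repeats across the remaining examples: llama_init_backend() becomes llama_backend_init(), and each program now pairs it with llama_backend_free() before returning. A minimal lifecycle sketch using only calls that appear in this commit's diffs (the model path is the default from common.h; evaluation and sampling are elided):

    #include "llama.h"

    int main() {
        llama_backend_init(/*numa =*/ false);          // one-time, process-wide setup

        llama_model * model = llama_load_model_from_file(
            "models/7B/ggml-model.bin", llama_context_default_params());
        if (model != NULL) {
            llama_context * ctx = llama_new_context_with_model(model, llama_context_default_params());
            if (ctx != NULL) {
                // ... tokenize, llama_eval(), sample ...
                llama_free(ctx);
            }
            llama_free_model(model);
        }

        llama_backend_free();                          // matching teardown added by this commit
        return 0;
    }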
examples/main/README.md CHANGED
@@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa
  - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
  - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
  - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
- - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+ - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
  - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
examples/main/main.cpp CHANGED
@@ -105,14 +105,20 @@ int main(int argc, char ** argv) {
  params.prompt = gpt_random_prompt(rng);
  }

- llama_init_backend(params.numa);
+ llama_backend_init(params.numa);

  llama_model * model;
  llama_context * ctx;
+ llama_context * ctx_guidance = NULL;
  g_ctx = &ctx;

  // load the model and apply lora adapter, if any
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ if (params.cfg_scale > 1.f) {
+     struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
+     ctx_guidance = llama_new_context_with_model(model, lparams);
+ }
+
  if (model == NULL) {
  fprintf(stderr, "%s: error: unable to load model\n", __func__);
  return 1;
@@ -183,15 +189,28 @@ int main(int argc, char ** argv) {
  // tokenize the prompt
  std::vector<llama_token> embd_inp;

- if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-     // Add a space in front of the first character to match OG llama tokenizer behavior
-     params.prompt.insert(0, 1, ' ');
+ // Add a space in front of the first character to match OG llama tokenizer behavior
+ params.prompt.insert(0, 1, ' ');

+ if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
  embd_inp = ::llama_tokenize(ctx, params.prompt, true);
  } else {
  embd_inp = session_tokens;
  }

+ // Tokenize negative prompt
+ std::vector<llama_token> guidance_inp;
+ int guidance_offset = 0;
+ int original_prompt_len = 0;
+ if (ctx_guidance) {
+     params.cfg_negative_prompt.insert(0, 1, ' ');
+     guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
+
+     std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+     original_prompt_len = original_inp.size();
+     guidance_offset = (int)guidance_inp.size() - original_prompt_len;
+ }
+
  const int n_ctx = llama_n_ctx(ctx);

  if ((int) embd_inp.size() > n_ctx - 4) {
@@ -258,6 +277,16 @@ int main(int argc, char ** argv) {
  for (int i = 0; i < (int) embd_inp.size(); i++) {
  fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
  }
+
+ if (ctx_guidance) {
+     fprintf(stderr, "\n");
+     fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+     fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
+     for (int i = 0; i < (int) guidance_inp.size(); i++) {
+         fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+     }
+ }
+
  if (params.n_keep > 0) {
  fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
  for (int i = 0; i < params.n_keep; i++) {
@@ -334,11 +363,13 @@ int main(int argc, char ** argv) {
  int n_remain = params.n_predict;
  int n_consumed = 0;
  int n_session_consumed = 0;
+ int n_past_guidance = 0;

  // the first thing we will do is to output the prompt, so set color accordingly
  console_set_color(con_st, CONSOLE_COLOR_PROMPT);

  std::vector<llama_token> embd;
+ std::vector<llama_token> embd_guidance;

  // do one empty run to warm up the model
  {
@@ -367,11 +398,12 @@ int main(int argc, char ** argv) {
  // if we run out of context:
  // - take the n_keep first tokens from the original prompt (via n_past)
  // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
- if (n_past + (int) embd.size() > n_ctx) {
+ if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
  const int n_left = n_past - params.n_keep;

  // always keep the first token - BOS
  n_past = std::max(1, params.n_keep);
+ n_past_guidance = std::max(1, params.n_keep + guidance_offset);

  // insert n_left/2 tokens at the start of embd from last_n_tokens
  embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
@@ -412,6 +444,48 @@ int main(int argc, char ** argv) {

  // evaluate tokens in batches
  // embd is typically prepared beforehand to fit within a batch, but not always
+
+ if (ctx_guidance) {
+     int input_size = 0;
+     llama_token* input_buf = NULL;
+
+     if (n_past_guidance < (int) guidance_inp.size()) {
+         // Guidance context should have the same data with these modifications:
+         //
+         // * Replace the initial prompt
+         // * Shift everything by guidance_offset
+         embd_guidance = guidance_inp;
+         if (embd.begin() + original_prompt_len < embd.end()) {
+             embd_guidance.insert(
+                 embd_guidance.end(),
+                 embd.begin() + original_prompt_len,
+                 embd.end()
+             );
+         }
+
+         input_buf = embd_guidance.data();
+         input_size = embd_guidance.size();
+         //fprintf(stderr, "\n---------------------\n");
+         //for (int i = 0; i < (int) embd_guidance.size(); i++) {
+         //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
+         //}
+         //fprintf(stderr, "\n---------------------\n");
+     } else {
+         input_buf = embd.data();
+         input_size = embd.size();
+     }
+
+     for (int i = 0; i < input_size; i += params.n_batch) {
+         int n_eval = std::min(input_size - i, params.n_batch);
+         if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
+             fprintf(stderr, "%s : failed to eval\n", __func__);
+             return 1;
+         }
+
+         n_past_guidance += n_eval;
+     }
+ }
+
  for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
  int n_eval = (int) embd.size() - i;
  if (n_eval > params.n_batch) {
@@ -431,6 +505,7 @@ int main(int argc, char ** argv) {
  }

  embd.clear();
+ embd_guidance.clear();

  if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
  // out of user input, sample next token
@@ -473,6 +548,10 @@ int main(int argc, char ** argv) {

  llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

+ if (ctx_guidance) {
+     llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor);
+ }
+
  // Apply penalties
  float nl_logit = logits[llama_token_nl()];
  auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
@@ -668,8 +747,11 @@ int main(int argc, char ** argv) {
  }

  llama_print_timings(ctx);
+ if (ctx_guidance) { llama_free(ctx_guidance); }
  llama_free(ctx);
  llama_free_model(model);

+ llama_backend_free();
+
  return 0;
  }
examples/metal/metal.cpp CHANGED
@@ -35,10 +35,9 @@ int main(int argc, char ** argv) {
  struct ggml_context * ctx_eval = NULL;

  struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
- gf.n_threads = 1;

  // this allocates all Metal resources and memory buffers
- auto * ctx_metal = ggml_metal_init();
+ auto * ctx_metal = ggml_metal_init(1);

  const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
  const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
examples/perplexity/perplexity.cpp CHANGED
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
  params.prompt = gpt_random_prompt(rng);
  }

- llama_init_backend(params.numa);
+ llama_backend_init(params.numa);

  llama_model * model;
  llama_context * ctx;
@@ -172,5 +172,7 @@ int main(int argc, char ** argv) {
  llama_free(ctx);
  llama_free_model(model);

+ llama_backend_free();
+
  return 0;
  }
examples/quantize/quantize.cpp CHANGED
@@ -178,7 +178,7 @@ int main(int argc, char ** argv) {
  usage(argv[0]);
  }

- llama_init_backend(false);
+ llama_backend_init(false);

  // parse command line arguments
  const std::string fname_inp = argv[arg_idx];
@@ -253,5 +253,7 @@ int main(int argc, char ** argv) {
  printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
  }

+ llama_backend_free();
+
  return 0;
  }
examples/server/README.md CHANGED
@@ -16,7 +16,7 @@ Command line options:
  - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
  - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
  - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+ - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
  - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
  - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
  - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
examples/server/server.cpp CHANGED
@@ -632,7 +632,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
  fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
  fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
  fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
- fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+ fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
  fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
  fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
  fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
@@ -820,7 +820,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
  break;
  }
  params.lora_adapter = argv[i];
- params.use_mmap = false;
  }
  else if (arg == "--lora-base")
  {
@@ -1079,7 +1078,7 @@ int main(int argc, char **argv)
  params.model_alias = params.model;
  }

- llama_init_backend(params.numa);
+ llama_backend_init(params.numa);

  LOG_INFO("build info", {{"build", BUILD_NUMBER},
  {"commit", BUILD_COMMIT}});
@@ -1309,5 +1308,7 @@ int main(int argc, char **argv)
  return 1;
  }

+ llama_backend_free();
+
  return 0;
  }
examples/simple/simple.cpp CHANGED
@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
  // Init LLM :
  //---------------------------------

- llama_init_backend(params.numa);
+ llama_backend_init(params.numa);

  llama_model * model;
  llama_context * ctx;
@@ -173,6 +173,8 @@ int main(int argc, char ** argv)
  llama_free( ctx );
  llama_free_model( model );

+ llama_backend_free();
+
  return 0;
  }

examples/train-text-from-scratch/train-text-from-scratch.cpp CHANGED
@@ -60,6 +60,17 @@ float frand_uniform(struct random_uniform_distribution * rnd) {
  return rnd->rd(rnd->gen);
  }

+ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+     if (plan.work_size > 0) {
+         buf.resize(plan.work_size);
+         plan.work_data = buf.data();
+     }
+
+     ggml_graph_compute(graph, &plan);
+ }
+
  struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
  float scale = 1.0f; // xavier
  switch (tensor->n_dims) {
@@ -1343,17 +1354,9 @@ struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
  }
  }

- if (t->src0) {
-     expand(g, t->src0);
- }
-
- if (t->src1) {
-     expand(g, t->src1);
- }
-
- for (int i = 0; i < GGML_MAX_OPT; ++i) {
-     if (t->opt[i]) {
-         expand(g, t->opt[i]);
+ for (int i = 0; i < GGML_MAX_SRC; ++i) {
+     if (t->src[i]) {
+         expand(g, t->src[i]);
  }
  }

@@ -1426,11 +1429,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(

  gf->n_nodes = 0;
  gf->n_leafs = 0;
- gf->work_size = 0;
  gf->perf_runs = 0;
  gf->perf_cycles = 0;
  gf->perf_time_us = 0;
- gf->work = NULL;

  const auto & hparams = model->hparams;
  //const int n_ctx = hparams.n_ctx;
@@ -3162,6 +3163,7 @@ int main(int argc, char ** argv) {
  printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
  // ggml_print_tensor_objects(model.ctx);

+ // TODO: use std::vector<uint8_t> intead of "new"
  size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
  uint8_t * compute_addr = new uint8_t[compute_size];

@@ -3183,6 +3185,8 @@ int main(int argc, char ** argv) {
  GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
  }

+ std::vector<uint8_t> work_buffer;
+
  printf("%s: begin training\n", __func__);

  for (int ex = 0; ex < params.n_examples; ++ex) {
@@ -3217,9 +3221,6 @@ int main(int argc, char ** argv) {
  struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
  struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;

- // ggml_cgraph gf = {};
- gf->n_threads = params.n_threads;
- gb->n_threads = params.n_threads;

  get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);

@@ -3248,7 +3249,7 @@ int main(int argc, char ** argv) {
  *gb = ggml_build_backward(ctx0, gf, true);
  }

- ggml_graph_compute(ctx0, gf);
+ ggml_graph_compute_helper(work_buffer, gf, params.n_threads);

  size_t used_mem_before_opt = ggml_used_mem(ctx0);

@@ -3272,7 +3273,7 @@ int main(int argc, char ** argv) {
  model.train_samples += n_batch;
  model.train_tokens += n_batch * n_tokens;

- ggml_graph_compute(ctx0, gf);
+ ggml_graph_compute_helper(work_buffer, gf, params.n_threads);

  float error_after_opt = ggml_get_f32_1d(loss, 0);

@@ -3354,13 +3355,12 @@ int main(int argc, char ** argv) {
  struct ggml_context * ctx0 = ggml_init(cparams);

  ggml_cgraph gf = {};
- gf.n_threads = params.n_threads;

  int n_past = 0;
  struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);

  ggml_build_forward_expand(&gf, logits);
- ggml_graph_compute(ctx0, &gf);
+ ggml_graph_compute_helper(work_buffer, &gf, params.n_threads);

  //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
  //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -3386,6 +3386,7 @@ int main(int argc, char ** argv) {
  delete[] compute_addr;
  delete[] compute_buf_0;
  delete[] compute_buf_1;
+
  llama_free(lctx);
  llama_free_model(lmodel);
  ggml_free(model.ctx);
expose.cpp CHANGED
@@ -220,6 +220,14 @@ extern "C"
  return generation_finished;
  }

+ float get_last_eval_time() {
+     return last_eval_time;
+ }
+
+ float get_last_process_time() {
+     return last_process_time;
+ }
+
  const char* get_pending_output() {
  return gpttype_get_pending_output().c_str();
  }
expose.h CHANGED
@@ -36,6 +36,7 @@ struct load_model_inputs
  const int debugmode = 0;
  const int forceversion = 0;
  const int gpulayers = 0;
+ const bool linear_rope;
  const char * banned_tokens[ban_token_max];
  };
  struct generation_inputs
@@ -71,3 +72,5 @@ extern std::string lora_filename;
  extern std::string lora_base;
  extern std::vector<std::string> generated_tokens;
  extern bool generation_finished;
+ extern float last_eval_time;
+ extern float last_process_time;
ggml-cuda.cu CHANGED
@@ -59,8 +59,8 @@ typedef float2 dfloat2;
59
  #endif //GGML_CUDA_DMMV_F16
60
 
61
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
62
- typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
63
- typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
64
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
65
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
66
  typedef void (*ggml_cuda_op_t)(
@@ -131,7 +131,7 @@ typedef struct {
131
  } block_q8_1;
132
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
133
 
134
- typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
135
 
136
  //================================= k-quants
137
 
@@ -208,6 +208,7 @@ typedef struct {
208
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
209
 
210
  #define WARP_SIZE 32
 
211
 
212
  #define CUDA_ADD_BLOCK_SIZE 256
213
  #define CUDA_MUL_BLOCK_SIZE 256
@@ -407,7 +408,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
407
 
408
  //================================== k-quants
409
 
410
- static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
411
 
412
  const int i = blockIdx.x;
413
  const block_q2_K * x = (const block_q2_K *) vx;
@@ -440,7 +441,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
440
 
441
  }
442
 
443
- static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
444
 
445
  const int i = blockIdx.x;
446
  const block_q3_K * x = (const block_q3_K *) vx;
@@ -504,7 +505,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
504
  }
505
  #endif
506
 
507
- static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
508
  const block_q4_K * x = (const block_q4_K *) vx;
509
 
510
  const int i = blockIdx.x;
@@ -544,7 +545,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
544
  #endif
545
  }
546
 
547
- static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
548
  const block_q5_K * x = (const block_q5_K *) vx;
549
 
550
  const int i = blockIdx.x;
@@ -590,7 +591,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
590
  #endif
591
  }
592
 
593
- static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
594
  const block_q6_K * x = (const block_q6_K *) vx;
595
 
596
  const int i = blockIdx.x;
@@ -634,7 +635,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
634
  #endif
635
  }
636
 
637
- static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
638
 
639
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
640
 
@@ -742,7 +743,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
742
  }
743
  }
744
 
745
- static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
746
 
747
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
748
  if (row > nrows) return;
@@ -846,7 +847,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
846
  }
847
  }
848
 
849
- static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
850
 
851
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
852
  if (row > nrows) return;
@@ -949,7 +950,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
949
  }
950
  }
951
 
952
- static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
953
 
954
  const int row = blockIdx.x;
955
  const int num_blocks_per_row = ncols / QK_K;
@@ -1053,7 +1054,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
1053
  }
1054
  }
1055
 
1056
- static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
1057
 
1058
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
1059
 
@@ -1171,7 +1172,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1171
  v.y = x[ib + iqs + 1];
1172
  }
1173
 
1174
- static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
1175
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
1176
 
1177
  if (i >= k) {
@@ -1180,10 +1181,10 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
1180
 
1181
  block_q8_1 * y = (block_q8_1 *) vy;
1182
 
1183
- const int ib = i / QK8_0; // block index
1184
- const int iqs = i % QK8_0; // quant index
1185
 
1186
- const float xi = x[i];
1187
  float amax = fabsf(xi);
1188
  float sum = xi;
1189
 
@@ -1207,7 +1208,7 @@ static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
1207
  }
1208
 
1209
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1210
- static __global__ void dequantize_block(const void * vx, float * y, const int k) {
1211
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
1212
 
1213
  if (i >= k) {
@@ -1227,7 +1228,7 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
1227
  y[iybs + iqs + y_offset] = v.y;
1228
  }
1229
 
1230
- static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1231
  #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1232
  const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1233
 
@@ -1252,7 +1253,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons
1252
  #endif // __CUDA_ARCH__ >= 600
1253
  }
1254
 
1255
- static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1256
  #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1257
  const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1258
 
@@ -1277,7 +1278,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons
1277
  #endif // __CUDA_ARCH__ >= 600
1278
  }
1279
 
1280
- static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1281
  #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1282
  const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1283
 
@@ -1312,7 +1313,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
1312
  #endif // __CUDA_ARCH__ >= 600
1313
  }
1314
 
1315
- static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1316
  #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1317
  const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1318
 
@@ -1346,7 +1347,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
1346
  #endif // __CUDA_ARCH__ >= 600
1347
  }
1348
 
1349
- static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1350
  #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1351
  const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1352
 
@@ -1366,7 +1367,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, cons
1366
  }
1367
 
1368
  template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
1369
- static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
1370
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
1371
 
1372
  if (row >= nrows) {
@@ -1404,7 +1405,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
1404
  }
1405
 
1406
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1407
- static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
1408
  // qk = quantized weights per x block
1409
  // qr = number of quantized weights per data value in x block
1410
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -1471,7 +1472,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
1471
  }
1472
  }
1473
 
1474
- static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
1475
  const half * x = (const half *) vx;
1476
 
1477
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1518,7 +1519,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
1518
  }
1519
 
1520
  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1521
- const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
1522
  const int row_stride_x, const int channel_stride_x) {
1523
 
1524
  const half * x = (const half *) vx;
@@ -1714,9 +1715,9 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
1714
  rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
1715
  }
1716
 
1717
- static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
1718
  const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
1719
- quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
1720
  }
1721
 
1722
  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -2380,16 +2381,15 @@ inline void ggml_cuda_op_mul_mat_vec(
2380
  src0->type == GGML_TYPE_Q5_1 ||
2381
  src0->type == GGML_TYPE_Q8_0;
2382
 
2383
- // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
2384
- // However, they have bad performance with Pascal cards.
2385
- // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
2386
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
2387
  #endif
2388
 
2389
  if (use_mul_mat_vec_q) {
 
 
2390
  size_t as;
2391
- void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
2392
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
2393
 
2394
  switch (src0->type) {
2395
  case GGML_TYPE_Q4_0:
@@ -2547,7 +2547,7 @@ inline void ggml_cuda_op_rope(
2547
  const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
2548
  const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);
2549
 
2550
- const float p = p0;
2551
 
2552
  // compute
2553
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
@@ -3136,7 +3136,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
3136
 
3137
  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
3138
  int nrows = ggml_nrows(tensor);
 
 
 
3139
  const size_t nb1 = tensor->nb[1];
 
3140
  ggml_backend backend = tensor->backend;
3141
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
3142
  memset(extra, 0, sizeof(*extra));
@@ -3165,11 +3169,24 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
3165
  int64_t nrows_split = row_high - row_low;
3166
 
3167
  const size_t offset_split = row_low*nb1;
3168
- const size_t size = ggml_nbytes_split(tensor, nrows_split);
 
 
 
 
 
 
 
3169
 
3170
- void * buf;
3171
  CUDA_CHECK(cudaMalloc(&buf, size));
3172
- void * buf_host = (char*)data + offset_split;
 
 
 
 
 
 
3173
 
3174
  cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
3175
 
@@ -3211,36 +3228,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
3211
  }
3212
 
3213
  // recursively assign CUDA buffers until a compute tensor is found
3214
- if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
3215
- const ggml_op src0_op = tensor->src0->op;
3216
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
3217
- ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
3218
  }
3219
  }
3220
- if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
3221
- ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
3222
  }
3223
 
3224
  tensor->backend = GGML_BACKEND_GPU;
3225
  struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
3226
  memset(extra, 0, sizeof(*extra));
3227
 
3228
- const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
3229
  tensor->op == GGML_OP_VIEW ||
3230
  force_inplace;
3231
  const size_t size = ggml_nbytes(tensor);
3232
 
3233
  CUDA_CHECK(cudaSetDevice(g_main_device));
3234
- if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
3235
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
3236
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
3237
  size_t offset = 0;
3238
  if (tensor->op == GGML_OP_VIEW) {
3239
- memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
3240
  }
3241
  extra->data_device[g_main_device] = src0_ddc + offset;
3242
  } else if (tensor->op == GGML_OP_CPY) {
3243
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
3244
  void * src1_ddv = src1_extra->data_device[g_main_device];
3245
  extra->data_device[g_main_device] = src1_ddv;
3246
  } else if (scratch) {
@@ -3311,8 +3328,8 @@ void ggml_cuda_free_scratch() {
3311
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
3312
  ggml_cuda_func_t func;
3313
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
3314
- || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
3315
- || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
3316
 
3317
  switch (tensor->op) {
3318
  case GGML_OP_ADD:
@@ -3340,7 +3357,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
3340
  func = ggml_cuda_rms_norm;
3341
  break;
3342
  case GGML_OP_MUL_MAT:
3343
- if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
3344
  return false;
3345
  }
3346
  func = ggml_cuda_mul_mat;
@@ -3394,6 +3411,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
3394
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
3395
  return true;
3396
  }
3397
- func(tensor->src0, tensor->src1, tensor);
3398
  return true;
3399
  }
 
59
  #endif //GGML_CUDA_DMMV_F16
60
 
61
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
62
+ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
63
+ typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
64
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
65
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
66
  typedef void (*ggml_cuda_op_t)(
 
131
  } block_q8_1;
132
  static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
133
 
134
+ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
135
 
136
  //================================= k-quants
137
 
 
208
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
209
 
210
  #define WARP_SIZE 32
211
+ #define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
212
 
213
  #define CUDA_ADD_BLOCK_SIZE 256
214
  #define CUDA_MUL_BLOCK_SIZE 256
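MATRIX_ROW_PADDING feeds the hunks further down: src1 rows are quantized to a padded length and split tensors get their last row padded, so the integer dot-product kernels can always read whole blocks without stepping past the end of a row. The round-up itself is the usual add-then-truncate idiom; a small worked example:

// round ne00 up to the next multiple of MATRIX_ROW_PADDING (256)
int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
// ne00 = 4001 -> 4096 (rounded up); ne00 = 4096 -> 4096 (already aligned)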
 
408
 
409
  //================================== k-quants
410
 
411
+ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
412
 
413
  const int i = blockIdx.x;
414
  const block_q2_K * x = (const block_q2_K *) vx;
 
441
 
442
  }
443
 
444
+ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
445
 
446
  const int i = blockIdx.x;
447
  const block_q3_K * x = (const block_q3_K *) vx;
 
505
  }
506
  #endif
507
 
508
+ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
509
  const block_q4_K * x = (const block_q4_K *) vx;
510
 
511
  const int i = blockIdx.x;
 
545
  #endif
546
  }
547
 
548
+ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
549
  const block_q5_K * x = (const block_q5_K *) vx;
550
 
551
  const int i = blockIdx.x;
 
591
  #endif
592
  }
593
 
594
+ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
595
  const block_q6_K * x = (const block_q6_K *) vx;
596
 
597
  const int i = blockIdx.x;
 
635
  #endif
636
  }
637
 
638
+ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
639
 
640
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
641
 
 
743
  }
744
  }
745
 
746
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
747
 
748
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
749
  if (row > nrows) return;
 
847
  }
848
  }
849
 
850
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
851
 
852
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
853
  if (row > nrows) return;
 
950
  }
951
  }
952
 
953
+ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
954
 
955
  const int row = blockIdx.x;
956
  const int num_blocks_per_row = ncols / QK_K;
 
1054
  }
1055
  }
1056
 
1057
+ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1058
 
1059
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
1060
 
 
1172
  v.y = x[ib + iqs + 1];
1173
  }
1174
 
1175
+ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
1176
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
1177
 
1178
  if (i >= k) {
 
1181
 
1182
  block_q8_1 * y = (block_q8_1 *) vy;
1183
 
1184
+ const int ib = i / QK8_1; // block index
1185
+ const int iqs = i % QK8_1; // quant index
1186
 
1187
+ const float xi = i < ndata ? x[i] : 0.0f;
1188
  float amax = fabsf(xi);
1189
  float sum = xi;
1190
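Note that quantize_q8_1 now takes both the real element count (ndata) and the padded launch size (k). Threads with ndata <= i < k still produce a q8_1 block, but from xi = 0.0f, so the padding introduced by MATRIX_ROW_PADDING becomes zero quants rather than out-of-bounds reads of x. The call site later in this file reflects the split:

// ndata = real row length, k = padded row length; the extra (k - ndata)
// values are quantized from zeros instead of being read from src1
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, /*ndata=*/ne00, /*k=*/padded_row_size, cudaStream_main);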
 
 
1208
  }
1209
 
1210
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1211
+ static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
1212
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
1213
 
1214
  if (i >= k) {
 
1228
  y[iybs + iqs + y_offset] = v.y;
1229
  }
1230
 
1231
+ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1232
  #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1233
  const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
1234
 
 
1253
  #endif // __CUDA_ARCH__ >= 600
1254
  }
1255
 
1256
+ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1257
  #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1258
  const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
1259
 
 
1278
  #endif // __CUDA_ARCH__ >= 600
1279
  }
1280
 
1281
+ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1282
  #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1283
  const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1284
 
 
1313
  #endif // __CUDA_ARCH__ >= 600
1314
  }
1315
 
1316
+ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1317
  #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1318
  const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
1319
 
 
1347
  #endif // __CUDA_ARCH__ >= 600
1348
  }
1349
 
1350
+ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1351
  #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
1352
  const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
1353
 
 
1367
  }
1368
 
1369
  template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
1370
+ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
1371
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
1372
 
1373
  if (row >= nrows) {
 
1405
  }
1406
 
1407
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
1408
+ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
1409
  // qk = quantized weights per x block
1410
  // qr = number of quantized weights per data value in x block
1411
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
 
1472
  }
1473
  }
1474
 
1475
+ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
1476
  const half * x = (const half *) vx;
1477
 
1478
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
 
1519
  }
1520
 
1521
  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1522
+ const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
1523
  const int row_stride_x, const int channel_stride_x) {
1524
 
1525
  const half * x = (const half *) vx;
 
1715
  rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
1716
  }
1717
 
1718
+ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
1719
  const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
1720
+ quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
1721
  }
1722
 
1723
  static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
 
2381
  src0->type == GGML_TYPE_Q5_1 ||
2382
  src0->type == GGML_TYPE_Q8_0;
2383
 
2384
+ const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented;
 
 
 
2385
  #endif
2386
 
2387
  if (use_mul_mat_vec_q) {
2388
+ int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
2389
+ padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
2390
  size_t as;
2391
+ void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
2392
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
2393
 
2394
  switch (src0->type) {
2395
  case GGML_TYPE_Q4_0:
 
2547
  const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
2548
  const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);
2549
 
2550
+ const float p = get_ntk_rope_scale_mode()?p0:(n_ctx <= GGML_TRAINING_CTX ? p0 : p0 * GGML_TRAINING_CTX / n_ctx);
2551
 
2552
  // compute
2553
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
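In other words, when NTK-aware scaling is disabled, positions are now compressed linearly once the context exceeds GGML_TRAINING_CTX. Assuming GGML_TRAINING_CTX is defined as 2048 (its value is not part of this hunk): with n_ctx = 4096, a token at p0 = 3000 is rotated as if it were at p = 3000 * 2048 / 4096 = 1500, while for n_ctx <= 2048 the position passes through unchanged, matching the previous p = p0.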
 
3136
 
3137
  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
3138
  int nrows = ggml_nrows(tensor);
3139
+
3140
+ const int64_t ne0 = tensor->ne[0];
3141
+
3142
  const size_t nb1 = tensor->nb[1];
3143
+
3144
  ggml_backend backend = tensor->backend;
3145
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
3146
  memset(extra, 0, sizeof(*extra));
 
3169
  int64_t nrows_split = row_high - row_low;
3170
 
3171
  const size_t offset_split = row_low*nb1;
3172
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
3173
+ const size_t original_size = size;
3174
+
3175
+ // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
3176
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
3177
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
3178
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
3179
+ }
3180
 
3181
+ char * buf;
3182
  CUDA_CHECK(cudaMalloc(&buf, size));
3183
+ char * buf_host = (char*)data + offset_split;
3184
+
3185
+ // set padding to 0 to avoid possible NaN values
3186
+ if (size > original_size) {
3187
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
3188
+ }
3189
+
3190
 
3191
  cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
3192
 
 
3228
  }
3229
 
3230
  // recursively assign CUDA buffers until a compute tensor is found
3231
+ if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
3232
+ const ggml_op src0_op = tensor->src[0]->op;
3233
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
3234
+ ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
3235
  }
3236
  }
3237
+ if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
3238
+ ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
3239
  }
3240
 
3241
  tensor->backend = GGML_BACKEND_GPU;
3242
  struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
3243
  memset(extra, 0, sizeof(*extra));
3244
 
3245
+ const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
3246
  tensor->op == GGML_OP_VIEW ||
3247
  force_inplace;
3248
  const size_t size = ggml_nbytes(tensor);
3249
 
3250
  CUDA_CHECK(cudaSetDevice(g_main_device));
3251
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
3252
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
3253
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
3254
  size_t offset = 0;
3255
  if (tensor->op == GGML_OP_VIEW) {
3256
+ memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
3257
  }
3258
  extra->data_device[g_main_device] = src0_ddc + offset;
3259
  } else if (tensor->op == GGML_OP_CPY) {
3260
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
3261
  void * src1_ddv = src1_extra->data_device[g_main_device];
3262
  extra->data_device[g_main_device] = src1_ddv;
3263
  } else if (scratch) {
 
3328
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
3329
  ggml_cuda_func_t func;
3330
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
3331
+ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
3332
+ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
3333
 
3334
  switch (tensor->op) {
3335
  case GGML_OP_ADD:
 
3357
  func = ggml_cuda_rms_norm;
3358
  break;
3359
  case GGML_OP_MUL_MAT:
3360
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
3361
  return false;
3362
  }
3363
  func = ggml_cuda_mul_mat;
 
3411
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
3412
  return true;
3413
  }
3414
+ func(tensor->src[0], tensor->src[1], tensor);
3415
  return true;
3416
  }
ggml-metal.h CHANGED
@@ -34,9 +34,13 @@ extern "C" {
34
 
35
  struct ggml_metal_context;
36
 
37
- struct ggml_metal_context * ggml_metal_init(void);
 
38
  void ggml_metal_free(struct ggml_metal_context * ctx);
39
 
 
 
 
40
  // creates a mapping between a host memory buffer and a device memory buffer
41
  // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
42
  // - the mapping is used during computation to determine the arguments of the compute kernels
 
34
 
35
  struct ggml_metal_context;
36
 
37
+ // number of command buffers to use
38
+ struct ggml_metal_context * ggml_metal_init(int n_cb);
39
  void ggml_metal_free(struct ggml_metal_context * ctx);
40
 
41
+ // set the number of command buffers to use
42
+ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
43
+
44
  // creates a mapping between a host memory buffer and a device memory buffer
45
  // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
46
  // - the mapping is used during computation to determine the arguments of the compute kernels
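A minimal usage sketch of the revised Metal API; the real call sites live in llama.cpp and are not part of this hunk, so the wrapper below is illustrative only:

#include "ggml.h"
#include "ggml-metal.h"

// illustrative: buffers still have to be mapped with ggml_metal_add_buffer()
// before computing, which is omitted here
static void run_graph_on_metal(struct ggml_cgraph * gf, int n_threads) {
    struct ggml_metal_context * ctx_metal = ggml_metal_init(1); // start with one command buffer
    ggml_metal_set_n_cb(ctx_metal, n_threads);                  // e.g. reuse the CPU thread count
    ggml_metal_graph_compute(ctx_metal, gf);
    ggml_metal_free(ctx_metal);
}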
ggml-metal.m CHANGED
@@ -25,6 +25,8 @@ struct ggml_metal_buffer {
25
  };
26
 
27
  struct ggml_metal_context {
 
 
28
  float * logits;
29
 
30
  id<MTLDevice> device;
@@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal";
86
  @implementation GGMLMetalClass
87
  @end
88
 
89
- struct ggml_metal_context * ggml_metal_init(void) {
90
  fprintf(stderr, "%s: allocating\n", __func__);
91
 
92
  struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
93
 
 
94
  ctx->device = MTLCreateSystemDefaultDevice();
95
  ctx->queue = [ctx->device newCommandQueue];
96
  ctx->n_buffers = 0;
@@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
208
  free(ctx);
209
  }
210
 
 
 
 
 
211
  // finds the Metal buffer that contains the tensor data on the GPU device
212
  // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
213
  // Metal buffer based on the host memory pointer
@@ -354,7 +361,7 @@ void ggml_metal_graph_compute(
354
  // create multiple command buffers and enqueue them
355
  // then, we encode the graph into the command buffers in parallel
356
 
357
- const int n_cb = gf->n_threads;
358
 
359
  NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
360
 
@@ -386,8 +393,8 @@ void ggml_metal_graph_compute(
386
  for (int i = node_start; i < node_end; ++i) {
387
  metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
388
 
389
- struct ggml_tensor * src0 = gf->nodes[i]->src0;
390
- struct ggml_tensor * src1 = gf->nodes[i]->src1;
391
  struct ggml_tensor * dst = gf->nodes[i];
392
 
393
  const int64_t ne00 = src0 ? src0->ne[0] : 0;
@@ -443,6 +450,7 @@ void ggml_metal_graph_compute(
443
  //}
444
 
445
  switch (dst->op) {
 
446
  case GGML_OP_RESHAPE:
447
  case GGML_OP_VIEW:
448
  case GGML_OP_TRANSPOSE:
 
25
  };
26
 
27
  struct ggml_metal_context {
28
+ int n_cb;
29
+
30
  float * logits;
31
 
32
  id<MTLDevice> device;
 
88
  @implementation GGMLMetalClass
89
  @end
90
 
91
+ struct ggml_metal_context * ggml_metal_init(int n_cb) {
92
  fprintf(stderr, "%s: allocating\n", __func__);
93
 
94
  struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
95
 
96
+ ctx->n_cb = n_cb;
97
  ctx->device = MTLCreateSystemDefaultDevice();
98
  ctx->queue = [ctx->device newCommandQueue];
99
  ctx->n_buffers = 0;
 
211
  free(ctx);
212
  }
213
 
214
+ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
215
+ ctx->n_cb = n_cb;
216
+ }
217
+
218
  // finds the Metal buffer that contains the tensor data on the GPU device
219
  // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
220
  // Metal buffer based on the host memory pointer
 
361
  // create multiple command buffers and enqueue them
362
  // then, we encode the graph into the command buffers in parallel
363
 
364
+ const int n_cb = ctx->n_cb;
365
 
366
  NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
367
 
 
393
  for (int i = node_start; i < node_end; ++i) {
394
  metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
395
 
396
+ struct ggml_tensor * src0 = gf->nodes[i]->src[0];
397
+ struct ggml_tensor * src1 = gf->nodes[i]->src[1];
398
  struct ggml_tensor * dst = gf->nodes[i];
399
 
400
  const int64_t ne00 = src0 ? src0->ne[0] : 0;
 
450
  //}
451
 
452
  switch (dst->op) {
453
+ case GGML_OP_NONE:
454
  case GGML_OP_RESHAPE:
455
  case GGML_OP_VIEW:
456
  case GGML_OP_TRANSPOSE:
ggml-mpi.c ADDED
@@ -0,0 +1,216 @@
1
+ #include "ggml-mpi.h"
2
+
3
+ #include "ggml.h"
4
+
5
+ #include <mpi.h>
6
+
7
+ #include <stdio.h>
8
+ #include <stdlib.h>
9
+
10
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
11
+
12
+ #define UNUSED GGML_UNUSED
13
+
14
+ struct ggml_mpi_context {
15
+ int rank;
16
+ int size;
17
+ };
18
+
19
+ void ggml_mpi_backend_init(void) {
20
+ MPI_Init(NULL, NULL);
21
+ }
22
+
23
+ void ggml_mpi_backend_free(void) {
24
+ MPI_Finalize();
25
+ }
26
+
27
+ struct ggml_mpi_context * ggml_mpi_init(void) {
28
+ struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
29
+
30
+ MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
31
+ MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
32
+
33
+ return ctx;
34
+ }
35
+
36
+ void ggml_mpi_free(struct ggml_mpi_context * ctx) {
37
+ free(ctx);
38
+ }
39
+
40
+ int ggml_mpi_rank(struct ggml_mpi_context * ctx) {
41
+ return ctx->rank;
42
+ }
43
+
44
+ void ggml_mpi_eval_init(
45
+ struct ggml_mpi_context * ctx_mpi,
46
+ int * n_tokens,
47
+ int * n_past,
48
+ int * n_threads) {
49
+ UNUSED(ctx_mpi);
50
+
51
+ // synchronize the worker node parameters with the root node
52
+ MPI_Barrier(MPI_COMM_WORLD);
53
+
54
+ MPI_Bcast(n_tokens, 1, MPI_INT, 0, MPI_COMM_WORLD);
55
+ MPI_Bcast(n_past, 1, MPI_INT, 0, MPI_COMM_WORLD);
56
+ MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
57
+ }
58
+
59
+ static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
60
+ struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
61
+ if (t == NULL) {
62
+ fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
63
+ return -1;
64
+ }
65
+
66
+ for (int i = 0; i < gf->n_nodes; i++) {
67
+ if (gf->nodes[i] == t) {
68
+ return i;
69
+ }
70
+ }
71
+
72
+ fprintf(stderr, "%s: tensor %s not found in graph (should not happen)\n", __func__, name);
73
+ return -1;
74
+ }
75
+
76
+ static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
77
+ MPI_Datatype mpi_type;
78
+
79
+ switch (t->type) {
80
+ case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
81
+ case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break;
82
+ default: GGML_ASSERT(false && "not implemented");
83
+ }
84
+
85
+ const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
86
+ GGML_ASSERT(retval == MPI_SUCCESS);
87
+ }
88
+
89
+ static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
90
+ MPI_Datatype mpi_type;
91
+
92
+ switch (t->type) {
93
+ case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
94
+ case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break;
95
+ default: GGML_ASSERT(false && "not implemented");
96
+ }
97
+
98
+ MPI_Status status; UNUSED(status);
99
+
100
+ const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
101
+ GGML_ASSERT(retval == MPI_SUCCESS);
102
+ }
103
+
104
+ // TODO: there are many improvements that can be done to this implementation
105
+ void ggml_mpi_graph_compute_pre(
106
+ struct ggml_mpi_context * ctx_mpi,
107
+ struct ggml_cgraph * gf,
108
+ int n_layers) {
109
+ const int mpi_rank = ctx_mpi->rank;
110
+ const int mpi_size = ctx_mpi->size;
111
+
112
+ struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
113
+ if (inp_tokens == NULL) {
114
+ fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
115
+ return;
116
+ }
117
+
118
+ struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
119
+ if (inp0 == NULL) {
120
+ fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
121
+ return;
122
+ }
123
+
124
+ GGML_ASSERT(inp0 == gf->nodes[0]);
125
+
126
+ // distribute the compute graph into slices across the MPI nodes
127
+ //
128
+ // the main node (0) processes the last layers + the remainder of the compute graph
129
+ // and is responsible to pass the input tokens to the first node (1)
130
+ //
131
+ // node 1: [( 0) * n_per_node, ( 1) * n_per_node)
132
+ // node 2: [( 1) * n_per_node, ( 2) * n_per_node)
133
+ // ...
134
+ // node n-1: [(n-2) * n_per_node, (n-1) * n_per_node)
135
+ // node 0: [(n-1) * n_per_node, n_nodes)
136
+ //
137
+ if (mpi_rank > 0) {
138
+ if (mpi_rank == 1) {
139
+ // the first node (1) receives the input tokens from the main node (0)
140
+ ggml_mpi_tensor_recv(inp_tokens, 0);
141
+ } else {
142
+ // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
143
+ ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
144
+ }
145
+ } else if (mpi_size > 1) {
146
+ // node 0 sends the input tokens to node 1
147
+ ggml_mpi_tensor_send(inp_tokens, 1);
148
+
149
+ // recv the output data from the last node
150
+ ggml_mpi_tensor_recv(inp0, mpi_size - 1);
151
+ }
152
+
153
+ {
154
+ const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
155
+
156
+ const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
157
+
158
+ const int il0 = (mpi_idx + 0) * n_per_node;
159
+ const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
160
+
161
+ char name_l0[GGML_MAX_NAME];
162
+ char name_l1[GGML_MAX_NAME];
163
+
164
+ snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
165
+ snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
166
+
167
+ const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0);
168
+ const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
169
+
170
+ if (idx_l0 < 0 || idx_l1 < 0) {
171
+ fprintf(stderr, "%s: layer input nodes not found\n", __func__);
172
+ return;
173
+ }
174
+
175
+ // attach the input data to all nodes that need it
176
+ // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
177
+ for (int i = idx_l0; i < idx_l1; i++) {
178
+ if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
179
+ gf->nodes[i]->src[0] = inp0;
180
+ }
181
+ if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
182
+ gf->nodes[i]->src[1] = inp0;
183
+ }
184
+ }
185
+
186
+ // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
187
+ for (int i = 1; i < idx_l1 - idx_l0; i++) {
188
+ gf->nodes[i] = gf->nodes[idx_l0 + i];
189
+ gf->grads[i] = gf->grads[idx_l0 + i];
190
+ }
191
+
192
+ // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
193
+ if (mpi_idx != 0) {
194
+ gf->nodes[0]->op = GGML_OP_NONE;
195
+ }
196
+
197
+ gf->n_nodes = idx_l1 - idx_l0;
198
+
199
+ //fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
200
+ }
201
+ }
202
+
203
+ void ggml_mpi_graph_compute_post(
204
+ struct ggml_mpi_context * ctx_mpi,
205
+ struct ggml_cgraph * gf,
206
+ int n_layers) {
207
+ UNUSED(n_layers);
208
+
209
+ const int mpi_rank = ctx_mpi->rank;
210
+ const int mpi_size = ctx_mpi->size;
211
+
212
+ // send the output data to the next node
213
+ if (mpi_rank > 0) {
214
+ ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
215
+ }
216
+ }
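As a worked example of the slicing comment in ggml_mpi_graph_compute_pre(): with n_layers = 32 and mpi_size = 4, n_per_node = (32 + 3) / 4 = 8, so rank 1 computes layers [0, 8), rank 2 computes [8, 16), rank 3 computes [16, 24), and rank 0 (mpi_idx = 3) computes [24, 32) plus whatever follows the last layer in the graph. Activations move rank to rank through ggml_mpi_tensor_send()/ggml_mpi_tensor_recv() in the pre and post hooks.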
ggml-mpi.h ADDED
@@ -0,0 +1,39 @@
1
+ #pragma once
2
+
3
+ struct ggml_context;
4
+ struct ggml_tensor;
5
+ struct ggml_cgraph;
6
+
7
+ #ifdef __cplusplus
8
+ extern "C" {
9
+ #endif
10
+
11
+ struct ggml_mpi_context;
12
+
13
+ void ggml_mpi_backend_init(void);
14
+ void ggml_mpi_backend_free(void);
15
+
16
+ struct ggml_mpi_context * ggml_mpi_init(void);
17
+ void ggml_mpi_free(struct ggml_mpi_context * ctx);
18
+
19
+ int ggml_mpi_rank(struct ggml_mpi_context * ctx);
20
+
21
+ void ggml_mpi_eval_init(
22
+ struct ggml_mpi_context * ctx_mpi,
23
+ int * n_tokens,
24
+ int * n_past,
25
+ int * n_threads);
26
+
27
+ void ggml_mpi_graph_compute_pre(
28
+ struct ggml_mpi_context * ctx_mpi,
29
+ struct ggml_cgraph * gf,
30
+ int n_layers);
31
+
32
+ void ggml_mpi_graph_compute_post(
33
+ struct ggml_mpi_context * ctx_mpi,
34
+ struct ggml_cgraph * gf,
35
+ int n_layers);
36
+
37
+ #ifdef __cplusplus
38
+ }
39
+ #endif
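The header only declares the hooks; a sketch of how they are meant to bracket graph evaluation on every rank (the wrapper below is illustrative, the real call sites are in llama.cpp and not shown here):

#include <cstdint>
#include <cstdlib>
#include "ggml.h"
#include "ggml-mpi.h"

// every rank builds the same graph; the pre hook trims it to this rank's slice
// and exchanges activations, the post hook forwards the result to the next rank
static void eval_graph_mpi(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf,
                           int n_layers, int n_threads) {
    ggml_mpi_graph_compute_pre(ctx_mpi, gf, n_layers);

    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
    uint8_t * work = plan.work_size > 0 ? (uint8_t *) std::malloc(plan.work_size) : nullptr;
    plan.work_data = work;
    ggml_graph_compute(gf, &plan);
    std::free(work);

    ggml_mpi_graph_compute_post(ctx_mpi, gf, n_layers);
}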
ggml.c CHANGED
@@ -247,7 +247,11 @@ inline static void* ggml_aligned_malloc(size_t size) {
247
  #include "ggml-opencl.h"
248
  #endif
249
  #elif defined(GGML_USE_OPENBLAS)
 
 
 
250
  #include <cblas.h>
 
251
  #elif defined(GGML_USE_CUBLAS)
252
  #include "ggml-cuda.h"
253
  #elif defined(GGML_USE_CLBLAST)
@@ -4280,20 +4284,33 @@ static inline int ggml_up(int n, int m) {
4280
  #define ggml_assert_aligned(ptr) \
4281
  GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
4282
 
 
 
 
 
 
 
 
 
 
4283
  float get_theta_scale(int n_dims,int n_past,int n_ctx)
4284
  {
4285
- if(n_ctx<=2048) //normie mode
4286
- {
4287
- return powf(10000.0, -2.0f/n_dims);
4288
- }
4289
- else
4290
- {
4291
- //using scaled NTK aware ctx
4292
- float a = (n_ctx<=4096?4.0:8.0);
4293
- float m = powf(a, n_dims / (n_dims - 2.0));
4294
- float s = powf(10000.0 * m, -2.0f/n_dims);
4295
- return s;
4296
- }
 
 
 
 
4297
  }
4298
 
4299
  ////////////////////////////////////////////////////////////////////////////////
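For reference, the NTK-aware scaling removed in this hunk computes theta_scale = (10000 * m)^(-2/n_dims) with m = a^(n_dims/(n_dims - 2)) and a = 4 for n_ctx <= 4096, a = 8 above that, falling back to the plain 10000^(-2/n_dims) for n_ctx <= 2048. For a typical head size of n_dims = 128 this gives roughly 0.866 in the plain case and roughly 0.847 with the 4096-context adjustment, which is the usual NTK trick of raising the frequency base (here from 10000 to 10000 * m) so that longer contexts stay within the trained rotation range.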
@@ -4597,17 +4614,14 @@ struct ggml_tensor * ggml_new_tensor_impl(
4597
  /*.op =*/ GGML_OP_NONE,
4598
  /*.is_param =*/ false,
4599
  /*.grad =*/ NULL,
4600
- /*.src0 =*/ NULL,
4601
- /*.src1 =*/ NULL,
4602
- /*.opt =*/ { NULL },
4603
- /*.n_tasks =*/ 0,
4604
  /*.perf_runs =*/ 0,
4605
  /*.perf_cycles =*/ 0,
4606
  /*.perf_time_us =*/ 0,
4607
  /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
4608
  /*.name =*/ { 0 },
4609
  /*.extra =*/ NULL,
4610
- /*.pad =*/ { 0 },
4611
  };
4612
 
4613
  // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
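This hunk, and the long run of near-identical operator hunks that follows, is the mechanical part of a repo-wide refactor (also visible in the ggml-cuda.cu and ggml-metal.m changes above): the separate src0/src1/opt[] pointers on ggml_tensor are folded into a single src[GGML_MAX_SRC] array, and the n_tasks and pad initializers are dropped. The per-operator pattern, shown as a before/after sketch:

// before the refactor:
result->src0   = a;
result->src1   = b;
result->opt[0] = c;

// after the refactor (one GGML_MAX_SRC-sized array):
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;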
@@ -5026,8 +5040,8 @@ struct ggml_tensor * ggml_dup_impl(
5026
 
5027
  result->op = GGML_OP_DUP;
5028
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5029
- result->src0 = a;
5030
- result->src1 = NULL;
5031
 
5032
  return result;
5033
  }
@@ -5063,8 +5077,8 @@ struct ggml_tensor * ggml_add_impl(
5063
 
5064
  result->op = GGML_OP_ADD;
5065
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5066
- result->src0 = a;
5067
- result->src1 = b;
5068
 
5069
  return result;
5070
  }
@@ -5103,8 +5117,8 @@ struct ggml_tensor * ggml_add1_impl(
5103
 
5104
  result->op = GGML_OP_ADD1;
5105
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5106
- result->src0 = a;
5107
- result->src1 = b;
5108
 
5109
  return result;
5110
  }
@@ -5161,9 +5175,9 @@ struct ggml_tensor * ggml_acc_impl(
5161
 
5162
  result->op = GGML_OP_ACC;
5163
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5164
- result->src0 = a;
5165
- result->src1 = b;
5166
- result->opt[0] = c;
5167
 
5168
  return result;
5169
  }
@@ -5209,8 +5223,8 @@ struct ggml_tensor * ggml_sub_impl(
5209
 
5210
  result->op = GGML_OP_SUB;
5211
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5212
- result->src0 = a;
5213
- result->src1 = b;
5214
 
5215
  return result;
5216
  }
@@ -5256,8 +5270,8 @@ struct ggml_tensor * ggml_mul_impl(
5256
 
5257
  result->op = GGML_OP_MUL;
5258
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5259
- result->src0 = a;
5260
- result->src1 = b;
5261
 
5262
  return result;
5263
  }
@@ -5299,8 +5313,8 @@ struct ggml_tensor * ggml_div_impl(
5299
 
5300
  result->op = GGML_OP_DIV;
5301
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5302
- result->src0 = a;
5303
- result->src1 = b;
5304
 
5305
  return result;
5306
  }
@@ -5335,8 +5349,8 @@ struct ggml_tensor * ggml_sqr_impl(
5335
 
5336
  result->op = GGML_OP_SQR;
5337
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5338
- result->src0 = a;
5339
- result->src1 = NULL;
5340
 
5341
  return result;
5342
  }
@@ -5369,8 +5383,8 @@ struct ggml_tensor * ggml_sqrt_impl(
5369
 
5370
  result->op = GGML_OP_SQRT;
5371
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5372
- result->src0 = a;
5373
- result->src1 = NULL;
5374
 
5375
  return result;
5376
  }
@@ -5404,8 +5418,8 @@ struct ggml_tensor * ggml_log_impl(
5404
 
5405
  result->op = GGML_OP_LOG;
5406
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5407
- result->src0 = a;
5408
- result->src1 = NULL;
5409
 
5410
  return result;
5411
  }
@@ -5437,8 +5451,8 @@ struct ggml_tensor * ggml_sum(
5437
 
5438
  result->op = GGML_OP_SUM;
5439
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5440
- result->src0 = a;
5441
- result->src1 = NULL;
5442
 
5443
  return result;
5444
  }
@@ -5464,8 +5478,8 @@ struct ggml_tensor * ggml_sum_rows(
5464
 
5465
  result->op = GGML_OP_SUM_ROWS;
5466
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5467
- result->src0 = a;
5468
- result->src1 = NULL;
5469
 
5470
  return result;
5471
  }
@@ -5487,8 +5501,8 @@ struct ggml_tensor * ggml_mean(
5487
 
5488
  result->op = GGML_OP_MEAN;
5489
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5490
- result->src0 = a;
5491
- result->src1 = NULL;
5492
 
5493
  return result;
5494
  }
@@ -5511,8 +5525,8 @@ struct ggml_tensor * ggml_argmax(
5511
 
5512
  result->op = GGML_OP_ARGMAX;
5513
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5514
- result->src0 = a;
5515
- result->src1 = NULL;
5516
 
5517
  return result;
5518
  }
@@ -5539,8 +5553,8 @@ struct ggml_tensor * ggml_repeat(
5539
 
5540
  result->op = GGML_OP_REPEAT;
5541
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5542
- result->src0 = a;
5543
- result->src1 = b;
5544
 
5545
  return result;
5546
  }
@@ -5567,8 +5581,8 @@ struct ggml_tensor * ggml_repeat_back(
5567
 
5568
  result->op = GGML_OP_REPEAT_BACK;
5569
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5570
- result->src0 = a;
5571
- result->src1 = b;
5572
 
5573
  return result;
5574
  }
@@ -5589,8 +5603,8 @@ struct ggml_tensor * ggml_abs_impl(
5589
 
5590
  result->op = GGML_OP_ABS;
5591
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5592
- result->src0 = a;
5593
- result->src1 = NULL;
5594
 
5595
  return result;
5596
  }
@@ -5624,8 +5638,8 @@ struct ggml_tensor * ggml_sgn_impl(
5624
 
5625
  result->op = GGML_OP_SGN;
5626
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5627
- result->src0 = a;
5628
- result->src1 = NULL;
5629
 
5630
  return result;
5631
  }
@@ -5658,8 +5672,8 @@ struct ggml_tensor * ggml_neg_impl(
5658
 
5659
  result->op = GGML_OP_NEG;
5660
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5661
- result->src0 = a;
5662
- result->src1 = NULL;
5663
 
5664
  return result;
5665
  }
@@ -5692,8 +5706,8 @@ struct ggml_tensor * ggml_step_impl(
5692
 
5693
  result->op = GGML_OP_STEP;
5694
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5695
- result->src0 = a;
5696
- result->src1 = NULL;
5697
 
5698
  return result;
5699
  }
@@ -5726,8 +5740,8 @@ struct ggml_tensor * ggml_tanh_impl(
5726
 
5727
  result->op = GGML_OP_TANH;
5728
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5729
- result->src0 = a;
5730
- result->src1 = NULL;
5731
 
5732
  return result;
5733
  }
@@ -5760,8 +5774,8 @@ struct ggml_tensor * ggml_elu_impl(
5760
 
5761
  result->op = GGML_OP_ELU;
5762
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5763
- result->src0 = a;
5764
- result->src1 = NULL;
5765
 
5766
  return result;
5767
  }
@@ -5794,8 +5808,8 @@ struct ggml_tensor * ggml_relu_impl(
5794
 
5795
  result->op = GGML_OP_RELU;
5796
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5797
- result->src0 = a;
5798
- result->src1 = NULL;
5799
 
5800
  return result;
5801
  }
@@ -5828,8 +5842,8 @@ struct ggml_tensor * ggml_gelu_impl(
5828
 
5829
  result->op = GGML_OP_GELU;
5830
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5831
- result->src0 = a;
5832
- result->src1 = NULL;
5833
 
5834
  return result;
5835
  }
@@ -5862,8 +5876,8 @@ struct ggml_tensor * ggml_gelu_quick_impl(
5862
 
5863
  result->op = GGML_OP_GELU_QUICK;
5864
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5865
- result->src0 = a;
5866
- result->src1 = NULL;
5867
 
5868
  return result;
5869
  }
@@ -5896,8 +5910,8 @@ struct ggml_tensor * ggml_silu_impl(
5896
 
5897
  result->op = GGML_OP_SILU;
5898
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5899
- result->src0 = a;
5900
- result->src1 = NULL;
5901
 
5902
  return result;
5903
  }
@@ -5931,8 +5945,8 @@ struct ggml_tensor * ggml_silu_back(
5931
 
5932
  result->op = GGML_OP_SILU_BACK;
5933
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5934
- result->src0 = a;
5935
- result->src1 = b;
5936
 
5937
  return result;
5938
  }
@@ -5954,8 +5968,8 @@ struct ggml_tensor * ggml_norm_impl(
5954
 
5955
  result->op = GGML_OP_NORM;
5956
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5957
- result->src0 = a;
5958
- result->src1 = NULL; // TODO: maybe store epsilon here?
5959
 
5960
  return result;
5961
  }
@@ -5986,8 +6000,8 @@ struct ggml_tensor * ggml_rms_norm_impl(
5986
 
5987
  result->op = GGML_OP_RMS_NORM;
5988
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5989
- result->src0 = a;
5990
- result->src1 = NULL; // TODO: maybe store epsilon here?
5991
 
5992
  return result;
5993
  }
@@ -6019,8 +6033,8 @@ struct ggml_tensor * ggml_rms_norm_back(
6019
 
6020
  result->op = GGML_OP_RMS_NORM_BACK;
6021
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6022
- result->src0 = a;
6023
- result->src1 = b;
6024
 
6025
  return result;
6026
  }
@@ -6046,8 +6060,8 @@ struct ggml_tensor * ggml_mul_mat(
6046
 
6047
  result->op = GGML_OP_MUL_MAT;
6048
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6049
- result->src0 = a;
6050
- result->src1 = b;
6051
 
6052
  return result;
6053
  }
@@ -6072,8 +6086,8 @@ struct ggml_tensor * ggml_out_prod(
6072
 
6073
  result->op = GGML_OP_OUT_PROD;
6074
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6075
- result->src0 = a;
6076
- result->src1 = b;
6077
 
6078
  return result;
6079
  }
@@ -6098,8 +6112,8 @@ struct ggml_tensor * ggml_scale_impl(
6098
 
6099
  result->op = GGML_OP_SCALE;
6100
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6101
- result->src0 = a;
6102
- result->src1 = b;
6103
 
6104
  return result;
6105
  }
@@ -6154,9 +6168,9 @@ struct ggml_tensor * ggml_set_impl(
6154
 
6155
  result->op = GGML_OP_SET;
6156
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6157
- result->src0 = a;
6158
- result->src1 = b;
6159
- result->opt[0] = c;
6160
 
6161
  return result;
6162
  }
@@ -6243,8 +6257,8 @@ struct ggml_tensor * ggml_cpy_impl(
6243
 
6244
  result->op = GGML_OP_CPY;
6245
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6246
- result->src0 = a;
6247
- result->src1 = b;
6248
 
6249
  return result;
6250
  }
@@ -6280,8 +6294,8 @@ struct ggml_tensor * ggml_cont_impl(
6280
 
6281
  result->op = GGML_OP_CONT;
6282
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6283
- result->src0 = a;
6284
- result->src1 = NULL;
6285
 
6286
  return result;
6287
  }
@@ -6324,8 +6338,8 @@ struct ggml_tensor * ggml_reshape(
6324
 
6325
  result->op = GGML_OP_RESHAPE;
6326
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6327
- result->src0 = a;
6328
- result->src1 = NULL;
6329
 
6330
  return result;
6331
  }
@@ -6349,8 +6363,8 @@ struct ggml_tensor * ggml_reshape_1d(
6349
 
6350
  result->op = GGML_OP_RESHAPE;
6351
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6352
- result->src0 = a;
6353
- result->src1 = NULL;
6354
 
6355
  return result;
6356
  }
@@ -6375,8 +6389,8 @@ struct ggml_tensor * ggml_reshape_2d(
6375
 
6376
  result->op = GGML_OP_RESHAPE;
6377
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6378
- result->src0 = a;
6379
- result->src1 = NULL;
6380
 
6381
  return result;
6382
  }
@@ -6402,8 +6416,8 @@ struct ggml_tensor * ggml_reshape_3d(
6402
 
6403
  result->op = GGML_OP_RESHAPE;
6404
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6405
- result->src0 = a;
6406
- result->src1 = NULL;
6407
 
6408
  return result;
6409
  }
@@ -6431,8 +6445,8 @@ struct ggml_tensor * ggml_reshape_4d(
6431
 
6432
  result->op = GGML_OP_RESHAPE;
6433
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6434
- result->src0 = a;
6435
- result->src1 = NULL;
6436
 
6437
  return result;
6438
  }
@@ -6464,9 +6478,9 @@ struct ggml_tensor * ggml_view_1d(
6464
 
6465
  result->op = GGML_OP_VIEW;
6466
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6467
- result->src0 = a;
6468
- result->src1 = NULL;
6469
- result->opt[0] = offs;
6470
 
6471
  return result;
6472
  }
@@ -6506,9 +6520,9 @@ struct ggml_tensor * ggml_view_2d(
6506
 
6507
  result->op = GGML_OP_VIEW;
6508
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6509
- result->src0 = a;
6510
- result->src1 = NULL;
6511
- result->opt[0] = offs;
6512
 
6513
  return result;
6514
  }
@@ -6550,9 +6564,9 @@ struct ggml_tensor * ggml_view_3d(
6550
 
6551
  result->op = GGML_OP_VIEW;
6552
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6553
- result->src0 = a;
6554
- result->src1 = NULL;
6555
- result->opt[0] = offs;
6556
 
6557
  return result;
6558
  }
@@ -6596,9 +6610,9 @@ struct ggml_tensor * ggml_view_4d(
6596
 
6597
  result->op = GGML_OP_VIEW;
6598
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6599
- result->src0 = a;
6600
- result->src1 = NULL;
6601
- result->opt[0] = offs;
6602
 
6603
  return result;
6604
  }
@@ -6658,8 +6672,8 @@ struct ggml_tensor * ggml_permute(
6658
 
6659
  result->op = GGML_OP_PERMUTE;
6660
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6661
- result->src0 = a;
6662
- result->src1 = NULL;
6663
 
6664
  if (is_node) {
6665
  ggml_scratch_save(ctx);
@@ -6673,7 +6687,7 @@ struct ggml_tensor * ggml_permute(
6673
 
6674
  ggml_scratch_load(ctx);
6675
 
6676
- result->opt[0] = b;
6677
  }
6678
 
6679
  return result;
@@ -6701,8 +6715,8 @@ struct ggml_tensor * ggml_transpose(
6701
 
6702
  result->op = GGML_OP_TRANSPOSE;
6703
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6704
- result->src0 = a;
6705
- result->src1 = NULL;
6706
 
6707
  return result;
6708
  }
@@ -6727,8 +6741,8 @@ struct ggml_tensor * ggml_get_rows(
6727
 
6728
  result->op = GGML_OP_GET_ROWS;
6729
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6730
- result->src0 = a;
6731
- result->src1 = b;
6732
 
6733
  return result;
6734
  }
@@ -6755,9 +6769,9 @@ struct ggml_tensor * ggml_get_rows_back(
6755
 
6756
  result->op = GGML_OP_GET_ROWS_BACK;
6757
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6758
- result->src0 = a;
6759
- result->src1 = b;
6760
- result->opt[0] = c;
6761
 
6762
  return result;
6763
  }
@@ -6779,8 +6793,8 @@ struct ggml_tensor * ggml_diag(
6779
 
6780
  result->op = GGML_OP_DIAG;
6781
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6782
- result->src0 = a;
6783
- result->src1 = NULL;
6784
 
6785
  return result;
6786
  }
@@ -6812,8 +6826,8 @@ struct ggml_tensor * ggml_diag_mask_inf_impl(
6812
 
6813
  result->op = GGML_OP_DIAG_MASK_INF;
6814
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6815
- result->src0 = a;
6816
- result->src1 = b;
6817
 
6818
  return result;
6819
  }
@@ -6860,8 +6874,8 @@ struct ggml_tensor * ggml_diag_mask_zero_impl(
6860
 
6861
  result->op = GGML_OP_DIAG_MASK_ZERO;
6862
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6863
- result->src0 = a;
6864
- result->src1 = b;
6865
 
6866
  return result;
6867
  }
@@ -6896,8 +6910,8 @@ struct ggml_tensor * ggml_soft_max_impl(
6896
 
6897
  result->op = GGML_OP_SOFT_MAX;
6898
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6899
- result->src0 = a;
6900
- result->src1 = NULL;
6901
 
6902
  return result;
6903
  }
@@ -6932,8 +6946,8 @@ struct ggml_tensor * ggml_soft_max_back_impl(
6932
 
6933
  result->op = GGML_OP_SOFT_MAX_BACK;
6934
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6935
- result->src0 = a;
6936
- result->src1 = b;
6937
 
6938
  return result;
6939
  }
@@ -6984,8 +6998,8 @@ struct ggml_tensor * ggml_rope_impl(
6984
 
6985
  result->op = GGML_OP_ROPE;
6986
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6987
- result->src0 = a;
6988
- result->src1 = b;
6989
 
6990
  return result;
6991
  }
@@ -7042,8 +7056,8 @@ struct ggml_tensor * ggml_rope_back(
7042
 
7043
  result->op = GGML_OP_ROPE_BACK;
7044
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7045
- result->src0 = a;
7046
- result->src1 = b;
7047
 
7048
  return result;
7049
  }
@@ -7081,8 +7095,8 @@ struct ggml_tensor * ggml_alibi(
7081
 
7082
  result->op = GGML_OP_ALIBI;
7083
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7084
- result->src0 = a;
7085
- result->src1 = b;
7086
 
7087
  return result;
7088
  }
@@ -7115,8 +7129,8 @@ struct ggml_tensor * ggml_clamp(
7115
 
7116
  result->op = GGML_OP_CLAMP;
7117
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7118
- result->src0 = a;
7119
- result->src1 = b;
7120
 
7121
  return result;
7122
  }
@@ -7158,9 +7172,9 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
7158
 
7159
  result->op = GGML_OP_CONV_1D;
7160
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7161
- result->src0 = a;
7162
- result->src1 = b;
7163
- result->opt[0] = c;
7164
 
7165
  return result;
7166
  }
@@ -7206,9 +7220,9 @@ struct ggml_tensor* ggml_conv_2d(
7206
 
7207
  result->op = GGML_OP_CONV_2D;
7208
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7209
- result->src0 = a;
7210
- result->src1 = b;
7211
- result->opt[0] = c;
7212
 
7213
  return result;
7214
 
@@ -7247,10 +7261,10 @@ struct ggml_tensor * ggml_flash_attn(
7247
 
7248
  result->op = GGML_OP_FLASH_ATTN;
7249
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7250
- result->src0 = q;
7251
- result->src1 = k;
7252
- result->opt[0] = v;
7253
- result->opt[1] = ggml_new_i32(ctx, masked ? 1 : 0);
7254
 
7255
  return result;
7256
  }
@@ -7278,11 +7292,11 @@ struct ggml_tensor * ggml_flash_ff(
7278
 
7279
  result->op = GGML_OP_FLASH_FF;
7280
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7281
- result->src0 = a;
7282
- result->src1 = b0;
7283
- result->opt[0] = b1;
7284
- result->opt[1] = c0;
7285
- result->opt[2] = c1;
7286
 
7287
  return result;
7288
  }
@@ -7342,11 +7356,11 @@ struct ggml_tensor * ggml_flash_attn_back(
7342
 
7343
  result->op = GGML_OP_FLASH_ATTN_BACK;
7344
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7345
- result->src0 = q;
7346
- result->src1 = k;
7347
- result->opt[0] = v;
7348
- result->opt[1] = d;
7349
- result->opt[2] = ggml_new_i32(ctx, masked ? 1 : 0);
7350
 
7351
  return result;
7352
  }
@@ -7391,9 +7405,9 @@ struct ggml_tensor * ggml_win_part(
7391
 
7392
  result->op = GGML_OP_WIN_PART;
7393
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7394
- result->src0 = a;
7395
- result->src1 = NULL;
7396
- result->opt[0] = b;
7397
 
7398
  return result;
7399
  }
@@ -7428,9 +7442,9 @@ struct ggml_tensor * ggml_win_unpart(
7428
 
7429
  result->op = GGML_OP_WIN_UNPART;
7430
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7431
- result->src0 = a;
7432
- result->src1 = NULL;
7433
- result->opt[0] = b;
7434
 
7435
  return result;
7436
  }
@@ -7459,8 +7473,8 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
7459
 
7460
  result->op = GGML_OP_MAP_UNARY;
7461
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7462
- result->src0 = a;
7463
- result->opt[0] = addr_tensor;
7464
 
7465
  return result;
7466
  }
@@ -7506,9 +7520,9 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
7506
 
7507
  result->op = GGML_OP_MAP_BINARY;
7508
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7509
- result->src0 = a;
7510
- result->src1 = b;
7511
- result->opt[0] = addr_tensor;
7512
 
7513
  return result;
7514
  }
@@ -7553,8 +7567,8 @@ struct ggml_tensor * ggml_map_custom1_impl_f32(
7553
 
7554
  result->op = GGML_OP_MAP_CUSTOM1;
7555
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7556
- result->src0 = a;
7557
- result->opt[0] = addr_tensor;
7558
 
7559
  return result;
7560
  }
@@ -7598,9 +7612,9 @@ struct ggml_tensor * ggml_map_custom2_impl_f32(
7598
 
7599
  result->op = GGML_OP_MAP_CUSTOM2;
7600
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7601
- result->src0 = a;
7602
- result->src1 = b;
7603
- result->opt[0] = addr_tensor;
7604
 
7605
  return result;
7606
  }
@@ -7647,10 +7661,10 @@ struct ggml_tensor * ggml_map_custom3_impl_f32(
7647
 
7648
  result->op = GGML_OP_MAP_CUSTOM3;
7649
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7650
- result->src0 = a;
7651
- result->src1 = b;
7652
- result->opt[0] = addr_tensor;
7653
- result->opt[1] = c;
7654
 
7655
  return result;
7656
  }
@@ -7690,8 +7704,8 @@ struct ggml_tensor * ggml_cross_entropy_loss(
7690
 
7691
  result->op = GGML_OP_CROSS_ENTROPY_LOSS;
7692
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7693
- result->src0 = a;
7694
- result->src1 = b;
7695
 
7696
  return result;
7697
  }
@@ -7710,9 +7724,9 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
7710
 
7711
  result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
7712
  result->grad = NULL;
7713
- result->src0 = a;
7714
- result->src1 = b;
7715
- result->opt[0] = c;
7716
 
7717
  return result;
7718
  }
@@ -10735,8 +10749,6 @@ static void ggml_compute_forward_mul_mat(
10735
 
10736
  float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
10737
 
10738
- assert(ne00 % 32 == 0);
10739
-
10740
  for (int64_t ic = 0; ic < ne11; ++ic) {
10741
  vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
10742
  }
@@ -12043,7 +12055,9 @@ static void ggml_compute_forward_rope_f32(
12043
  dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
12044
  }
12045
  } else if (!is_neox) {
12046
-
12047
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12048
  const float cos_theta = cosf(theta);
12049
  const float sin_theta = sinf(theta);
@@ -12171,6 +12185,9 @@ static void ggml_compute_forward_rope_f16(
12171
  dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
12172
  }
12173
  } if (!is_neox) {
12174
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12175
  const float cos_theta = cosf(theta);
12176
  const float sin_theta = sinf(theta);
@@ -12296,6 +12313,9 @@ static void ggml_compute_forward_rope_back_f32(
12296
  float theta = (float)p;
12297
 
12298
  if (!is_neox) {
12299
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12300
  const float cos_theta = cosf(theta);
12301
  const float sin_theta = sinf(theta);
@@ -12396,6 +12416,9 @@ static void ggml_compute_forward_rope_back_f16(
12396
  float theta = (float)p;
12397
 
12398
  if (!is_neox) {
12399
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12400
  const float cos_theta = cosf(theta);
12401
  const float sin_theta = sinf(theta);
@@ -14586,287 +14609,287 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14586
  if (skip_cpu) {
14587
  return;
14588
  }
14589
- GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
14590
- GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
14591
  #endif // GGML_USE_CUBLAS
14592
 
14593
  switch (tensor->op) {
14594
  case GGML_OP_DUP:
14595
  {
14596
- ggml_compute_forward_dup(params, tensor->src0, tensor);
14597
  } break;
14598
  case GGML_OP_ADD:
14599
  {
14600
- ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor);
14601
  } break;
14602
  case GGML_OP_ADD1:
14603
  {
14604
- ggml_compute_forward_add1(params, tensor->src0, tensor->src1, tensor);
14605
  } break;
14606
  case GGML_OP_ACC:
14607
  {
14608
- ggml_compute_forward_acc(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
14609
  } break;
14610
  case GGML_OP_SUB:
14611
  {
14612
- ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor);
14613
  } break;
14614
  case GGML_OP_MUL:
14615
  {
14616
- ggml_compute_forward_mul(params, tensor->src0, tensor->src1, tensor);
14617
  } break;
14618
  case GGML_OP_DIV:
14619
  {
14620
- ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor);
14621
  } break;
14622
  case GGML_OP_SQR:
14623
  {
14624
- ggml_compute_forward_sqr(params, tensor->src0, tensor);
14625
  } break;
14626
  case GGML_OP_SQRT:
14627
  {
14628
- ggml_compute_forward_sqrt(params, tensor->src0, tensor);
14629
  } break;
14630
  case GGML_OP_LOG:
14631
  {
14632
- ggml_compute_forward_log(params, tensor->src0, tensor);
14633
  } break;
14634
  case GGML_OP_SUM:
14635
  {
14636
- ggml_compute_forward_sum(params, tensor->src0, tensor);
14637
  } break;
14638
  case GGML_OP_SUM_ROWS:
14639
  {
14640
- ggml_compute_forward_sum_rows(params, tensor->src0, tensor);
14641
  } break;
14642
  case GGML_OP_MEAN:
14643
  {
14644
- ggml_compute_forward_mean(params, tensor->src0, tensor);
14645
  } break;
14646
  case GGML_OP_ARGMAX:
14647
  {
14648
- ggml_compute_forward_argmax(params, tensor->src0, tensor);
14649
  } break;
14650
  case GGML_OP_REPEAT:
14651
  {
14652
- ggml_compute_forward_repeat(params, tensor->src0, tensor);
14653
  } break;
14654
  case GGML_OP_REPEAT_BACK:
14655
  {
14656
- ggml_compute_forward_repeat_back(params, tensor->src0, tensor);
14657
  } break;
14658
  case GGML_OP_ABS:
14659
  {
14660
- ggml_compute_forward_abs(params, tensor->src0, tensor);
14661
  } break;
14662
  case GGML_OP_SGN:
14663
  {
14664
- ggml_compute_forward_sgn(params, tensor->src0, tensor);
14665
  } break;
14666
  case GGML_OP_NEG:
14667
  {
14668
- ggml_compute_forward_neg(params, tensor->src0, tensor);
14669
  } break;
14670
  case GGML_OP_STEP:
14671
  {
14672
- ggml_compute_forward_step(params, tensor->src0, tensor);
14673
  } break;
14674
  case GGML_OP_TANH:
14675
  {
14676
- ggml_compute_forward_tanh(params, tensor->src0, tensor);
14677
  } break;
14678
  case GGML_OP_ELU:
14679
  {
14680
- ggml_compute_forward_elu(params, tensor->src0, tensor);
14681
  } break;
14682
  case GGML_OP_RELU:
14683
  {
14684
- ggml_compute_forward_relu(params, tensor->src0, tensor);
14685
  } break;
14686
  case GGML_OP_GELU:
14687
  {
14688
- ggml_compute_forward_gelu(params, tensor->src0, tensor);
14689
  } break;
14690
  case GGML_OP_GELU_QUICK:
14691
  {
14692
- ggml_compute_forward_gelu_quick(params, tensor->src0, tensor);
14693
  } break;
14694
  case GGML_OP_SILU:
14695
  {
14696
- ggml_compute_forward_silu(params, tensor->src0, tensor);
14697
  } break;
14698
  case GGML_OP_SILU_BACK:
14699
  {
14700
- ggml_compute_forward_silu_back(params, tensor->src0, tensor->src1, tensor);
14701
  } break;
14702
  case GGML_OP_NORM:
14703
  {
14704
- ggml_compute_forward_norm(params, tensor->src0, tensor);
14705
  } break;
14706
  case GGML_OP_RMS_NORM:
14707
  {
14708
- ggml_compute_forward_rms_norm(params, tensor->src0, tensor);
14709
  } break;
14710
  case GGML_OP_RMS_NORM_BACK:
14711
  {
14712
- ggml_compute_forward_rms_norm_back(params, tensor->src0, tensor->src1, tensor);
14713
  } break;
14714
  case GGML_OP_MUL_MAT:
14715
  {
14716
- ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor);
14717
  } break;
14718
  case GGML_OP_OUT_PROD:
14719
  {
14720
- ggml_compute_forward_out_prod(params, tensor->src0, tensor->src1, tensor);
14721
  } break;
14722
  case GGML_OP_SCALE:
14723
  {
14724
- ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor);
14725
  } break;
14726
  case GGML_OP_SET:
14727
  {
14728
- ggml_compute_forward_set(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
14729
  } break;
14730
  case GGML_OP_CPY:
14731
  {
14732
- ggml_compute_forward_cpy(params, tensor->src0, tensor);
14733
  } break;
14734
  case GGML_OP_CONT:
14735
  {
14736
- ggml_compute_forward_cont(params, tensor->src0, tensor);
14737
  } break;
14738
  case GGML_OP_RESHAPE:
14739
  {
14740
- ggml_compute_forward_reshape(params, tensor->src0, tensor);
14741
  } break;
14742
  case GGML_OP_VIEW:
14743
  {
14744
- ggml_compute_forward_view(params, tensor->src0);
14745
  } break;
14746
  case GGML_OP_PERMUTE:
14747
  {
14748
- ggml_compute_forward_permute(params, tensor->src0);
14749
  } break;
14750
  case GGML_OP_TRANSPOSE:
14751
  {
14752
- ggml_compute_forward_transpose(params, tensor->src0);
14753
  } break;
14754
  case GGML_OP_GET_ROWS:
14755
  {
14756
- ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor);
14757
  } break;
14758
  case GGML_OP_GET_ROWS_BACK:
14759
  {
14760
- ggml_compute_forward_get_rows_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
14761
  } break;
14762
  case GGML_OP_DIAG:
14763
  {
14764
- ggml_compute_forward_diag(params, tensor->src0, tensor);
14765
  } break;
14766
  case GGML_OP_DIAG_MASK_INF:
14767
  {
14768
- ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor);
14769
  } break;
14770
  case GGML_OP_DIAG_MASK_ZERO:
14771
  {
14772
- ggml_compute_forward_diag_mask_zero(params, tensor->src0, tensor->src1, tensor);
14773
  } break;
14774
  case GGML_OP_SOFT_MAX:
14775
  {
14776
- ggml_compute_forward_soft_max(params, tensor->src0, tensor);
14777
  } break;
14778
  case GGML_OP_SOFT_MAX_BACK:
14779
  {
14780
- ggml_compute_forward_soft_max_back(params, tensor->src0, tensor->src1, tensor);
14781
  } break;
14782
  case GGML_OP_ROPE:
14783
  {
14784
- ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor);
14785
  } break;
14786
  case GGML_OP_ROPE_BACK:
14787
  {
14788
- ggml_compute_forward_rope_back(params, tensor->src0, tensor->src1, tensor);
14789
  } break;
14790
  case GGML_OP_ALIBI:
14791
  {
14792
- ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
14793
  } break;
14794
  case GGML_OP_CLAMP:
14795
  {
14796
- ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
14797
  } break;
14798
  case GGML_OP_CONV_1D:
14799
  {
14800
- ggml_compute_forward_conv_1d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
14801
  } break;
14802
  case GGML_OP_CONV_2D:
14803
  {
14804
- ggml_compute_forward_conv_2d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
14805
  } break;
14806
  case GGML_OP_FLASH_ATTN:
14807
  {
14808
- const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
14809
  GGML_ASSERT(t == 0 || t == 1);
14810
  const bool masked = t != 0;
14811
- ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
14812
  } break;
14813
  case GGML_OP_FLASH_FF:
14814
  {
14815
- ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor);
14816
  } break;
14817
  case GGML_OP_FLASH_ATTN_BACK:
14818
  {
14819
- int32_t t = ggml_get_i32_1d(tensor->opt[2], 0);
14820
  GGML_ASSERT(t == 0 || t == 1);
14821
  bool masked = t != 0;
14822
- ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
14823
  } break;
14824
  case GGML_OP_WIN_PART:
14825
  {
14826
- ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor);
14827
  } break;
14828
  case GGML_OP_WIN_UNPART:
14829
  {
14830
- ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor);
14831
  } break;
14832
  case GGML_OP_MAP_UNARY:
14833
  {
14834
- const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
14835
- ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun);
14836
  }
14837
  break;
14838
  case GGML_OP_MAP_BINARY:
14839
  {
14840
- const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data);
14841
- ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
14842
  }
14843
  break;
14844
  case GGML_OP_MAP_CUSTOM1:
14845
  {
14846
- const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
14847
- ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
14848
  }
14849
  break;
14850
  case GGML_OP_MAP_CUSTOM2:
14851
  {
14852
- const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
14853
- ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
14854
  }
14855
  break;
14856
  case GGML_OP_MAP_CUSTOM3:
14857
  {
14858
- const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
14859
- ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
14860
  }
14861
  break;
14862
  case GGML_OP_CROSS_ENTROPY_LOSS:
14863
  {
14864
- ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
14865
  }
14866
  break;
14867
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
14868
  {
14869
- ggml_compute_forward_cross_entropy_loss_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor);
14870
  }
14871
  break;
14872
  case GGML_OP_NONE:
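The forward-dispatch hunk above strips every tensor->src0, tensor->src1, and tensor->opt[i] argument from the per-op calls; the replacement lines are not visible in this extract. Given the result->src[i] wiring added elsewhere in the commit, the cases presumably read from the unified array instead, roughly as in this hypothetical sketch (indices follow the '+' assignments shown later in this diff):

    /* Hypothetical fragment -- the actual '+' side of this hunk is not shown. */
    switch (tensor->op) {
        case GGML_OP_ADD:
            {
                ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
            } break;
        case GGML_OP_FLASH_ATTN:
            {
                const int32_t t = ggml_get_i32_1d(tensor->src[3], 0); /* masked flag parked in src[3] */
                GGML_ASSERT(t == 0 || t == 1);
                ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], t != 0, tensor);
            } break;
        default:
            break;
    }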
@@ -14883,8 +14906,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14883
  ////////////////////////////////////////////////////////////////////////////////
14884
 
14885
  static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
14886
- struct ggml_tensor * src0 = tensor->src0;
14887
- struct ggml_tensor * src1 = tensor->src1;
14888
 
14889
  switch (tensor->op) {
14890
  case GGML_OP_DUP:
@@ -14920,12 +14943,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14920
  src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
14921
  }
14922
  if (src1->grad) {
14923
- GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5);
14924
- GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32);
14925
- const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0];
14926
- const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1];
14927
- const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2];
14928
- const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3];
14929
 
14930
  struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
14931
  tensor->grad,
@@ -15233,12 +15256,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15233
  } break;
15234
  case GGML_OP_SET:
15235
  {
15236
- GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5);
15237
- GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32);
15238
- const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0];
15239
- const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1];
15240
- const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2];
15241
- const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3];
15242
 
15243
  struct ggml_tensor * tensor_grad_view = NULL;
15244
 
@@ -15315,8 +15338,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15315
  if (src0->grad) {
15316
  size_t offset;
15317
 
15318
- GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->opt[0]));
15319
- memcpy(&offset, tensor->opt[0]->data, sizeof(offset));
15320
 
15321
  size_t nb1 = tensor->nb[1];
15322
  size_t nb2 = tensor->nb[2];
@@ -15343,7 +15366,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15343
  {
15344
  // necessary for llama
15345
  if (src0->grad) {
15346
- int32_t * axes = (int32_t *) tensor->opt[0]->data;
15347
  int axis0 = axes[0] & 0x3;
15348
  int axis1 = axes[1] & 0x3;
15349
  int axis2 = axes[2] & 0x3;
@@ -15506,15 +15529,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15506
  case GGML_OP_FLASH_ATTN:
15507
  {
15508
  struct ggml_tensor * flash_grad = NULL;
15509
- if (src0->grad || src1->grad || tensor->opt[0]->grad) {
15510
- int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
15511
  GGML_ASSERT(t == 0 || t == 1);
15512
  bool masked = t != 0;
15513
  flash_grad =
15514
  ggml_flash_attn_back(ctx,
15515
  src0,
15516
  src1,
15517
- tensor->opt[0],
15518
  tensor->grad,
15519
  masked);
15520
  }
@@ -15611,7 +15634,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15611
  inplace);
15612
  }
15613
 
15614
- struct ggml_tensor * opt0 = tensor->opt[0];
15615
 
15616
  if (opt0->grad) {
15617
  struct ggml_tensor * grad_v = NULL;
@@ -15727,17 +15750,9 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15727
  }
15728
  }
15729
 
15730
- if (node->src0) {
15731
- ggml_visit_parents(cgraph, node->src0);
15732
- }
15733
-
15734
- if (node->src1) {
15735
- ggml_visit_parents(cgraph, node->src1);
15736
- }
15737
-
15738
- for (int i = 0; i < GGML_MAX_OPT; ++i) {
15739
- if (node->opt[i]) {
15740
- ggml_visit_parents(cgraph, node->opt[i]);
15741
  }
15742
  }
15743
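ggml_visit_parents likewise stops walking src0, src1, and the opt[] slots separately; the replacement loop is blanked out in this extract, but it presumably iterates the unified source array, along the lines of this sketch (the array-length constant, written here as GGML_MAX_SRC, is an assumed name that does not appear in this extract):

    /* Hypothetical replacement for the three removed blocks above. */
    for (int i = 0; i < GGML_MAX_SRC; ++i) {
        if (node->src[i]) {
            ggml_visit_parents(cgraph, node->src[i]);
        }
    }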
 
@@ -15792,9 +15807,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
15792
  struct ggml_cgraph result = {
15793
  /*.n_nodes =*/ 0,
15794
  /*.n_leafs =*/ 0,
15795
- /*.n_threads =*/ GGML_DEFAULT_N_THREADS,
15796
- /*.work_size =*/ 0,
15797
- /*.work =*/ NULL,
15798
  /*.nodes =*/ { NULL },
15799
  /*.grads =*/ { NULL },
15800
  /*.leafs =*/ { NULL },
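ggml_build_forward drops the n_threads, work_size, and work members from the graph initializer, and the optimizer code further down stops assigning gf->n_threads and no longer calls ggml_graph_compute(ctx, gb) directly (the replacement call is blanked out in this extract). For reference, this is the usage pattern the removed fields supported, reconstructed as a minimal sketch of the old API only:

    /* Old pattern, pieced together from the '-' lines in this diff; the new
     * entry point is not visible here. */
    struct ggml_cgraph gf = ggml_build_forward(f);
    gf.n_threads = params.n_threads;   /* thread count lived on the graph itself */
    ggml_graph_compute(ctx, &gf);      /* work buffer lazily allocated from ctx  */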
@@ -15965,12 +15977,13 @@ void clear_numa_thread_affinity(void) {}
15965
  #endif
15966
 
15967
  struct ggml_compute_state_shared {
15968
- struct ggml_cgraph * cgraph;
 
15969
 
15970
  int64_t perf_node_start_cycles;
15971
  int64_t perf_node_start_time_us;
15972
 
15973
- int n_threads;
15974
 
15975
  // synchronization primitives
15976
  atomic_int n_active; // num active threads
@@ -15994,9 +16007,13 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
15994
 
15995
  static thread_ret_t ggml_graph_compute_thread(void * data) {
15996
  struct ggml_compute_state * state = (struct ggml_compute_state *) data;
15997
- struct ggml_cgraph * cgraph = state->shared->cgraph;
15998
 
15999
- const int n_threads = state->shared->n_threads;
16000
  set_numa_thread_affinity(state->ith, n_threads);
16001
 
16002
  int node_n = -1;
@@ -16009,15 +16026,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16009
  /*.type =*/ GGML_TASK_FINALIZE,
16010
  /*.ith =*/ 0,
16011
  /*.nth =*/ 0,
16012
- /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16013
- /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16014
  };
16015
 
16016
  if (node_n != -1) {
16017
  /* FINALIZE */
16018
  struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
16019
  if (GGML_OP_HAS_FINALIZE[node->op]) {
16020
- params.nth = node->n_tasks;
16021
  ggml_compute_forward(&params, node);
16022
  ggml_graph_compute_perf_stats_node(node, state->shared);
16023
  }
@@ -16028,11 +16045,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16028
  GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16029
 
16030
  struct ggml_tensor * node = cgraph->nodes[node_n];
 
16031
 
16032
  state->shared->perf_node_start_cycles = ggml_perf_cycles();
16033
  state->shared->perf_node_start_time_us = ggml_perf_time_us();
16034
 
16035
- params.nth = node->n_tasks;
16036
 
16037
  /* INIT */
16038
  if (GGML_OP_HAS_INIT[node->op]) {
@@ -16040,7 +16058,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16040
  ggml_compute_forward(&params, node);
16041
  }
16042
 
16043
- if (node->n_tasks == 1) {
16044
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
16045
  // they do something more efficient than spinning (?)
16046
  params.type = GGML_TASK_COMPUTE;
@@ -16062,7 +16080,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16062
  // wait for other threads to finish
16063
  const int last = node_n;
16064
  do {
16065
- sched_yield();
16066
  node_n = atomic_load(&state->shared->node_n);
16067
  } while (node_n == last);
16068
  }
@@ -16072,16 +16090,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16072
 
16073
  /* COMPUTE */
16074
  struct ggml_tensor * node = cgraph->nodes[node_n];
 
16075
 
16076
  struct ggml_compute_params params = {
16077
  /*.type =*/ GGML_TASK_COMPUTE,
16078
  /*.ith =*/ state->ith,
16079
- /*.nth =*/ node->n_tasks,
16080
- /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16081
- /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16082
  };
16083
 
16084
- if (state->ith < node->n_tasks) {
16085
  ggml_compute_forward(&params, node);
16086
  }
16087
  }
@@ -16089,349 +16108,372 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16089
  return 0;
16090
  }
16091
 
16092
- void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
16093
- const int n_threads = cgraph->n_threads;
16094
 
16095
- struct ggml_compute_state_shared state_shared = {
16096
- /*.cgraph =*/ cgraph,
16097
- /*.perf_node_start_cycles =*/ 0,
16098
- /*.perf_node_start_time_us =*/ 0,
16099
- /*.n_threads =*/ n_threads,
16100
- /*.n_active =*/ n_threads,
16101
- /*.node_n =*/ -1,
16102
- };
16103
- struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
16104
 
16105
- // initialize tasks + work buffer
16106
- {
16107
- size_t work_size = 0;
16108
 
16109
- // thread scheduling for the different operations
16110
- for (int i = 0; i < cgraph->n_nodes; i++) {
16111
- struct ggml_tensor * node = cgraph->nodes[i];
16112
 
16113
- switch (node->op) {
16114
- case GGML_OP_CPY:
16115
- case GGML_OP_DUP:
16116
- {
16117
- node->n_tasks = n_threads;
16118
 
16119
- size_t cur = 0;
16120
- if (ggml_is_quantized(node->type)) {
16121
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads;
16122
- }
 
16123
 
16124
- work_size = MAX(work_size, cur);
16125
- } break;
16126
- case GGML_OP_ADD:
16127
- case GGML_OP_ADD1:
16128
- {
16129
- node->n_tasks = n_threads;
16130
 
16131
- size_t cur = 0;
16132
 
16133
- if (ggml_is_quantized(node->src0->type)) {
16134
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
16135
- }
16136
 
16137
- work_size = MAX(work_size, cur);
16138
- } break;
16139
- case GGML_OP_ACC:
16140
- {
16141
- node->n_tasks = n_threads;
16142
 
16143
- size_t cur = 0;
16144
 
16145
- if (ggml_is_quantized(node->src0->type)) {
16146
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads;
16147
- }
16148
 
16149
- work_size = MAX(work_size, cur);
16150
- } break;
16151
- case GGML_OP_SUB:
16152
- case GGML_OP_DIV:
16153
- case GGML_OP_SQR:
16154
- case GGML_OP_SQRT:
16155
- case GGML_OP_LOG:
16156
- case GGML_OP_SUM:
16157
- case GGML_OP_SUM_ROWS:
16158
- case GGML_OP_MEAN:
16159
- case GGML_OP_ARGMAX:
16160
- case GGML_OP_REPEAT:
16161
- case GGML_OP_REPEAT_BACK:
16162
- case GGML_OP_ABS:
16163
- case GGML_OP_SGN:
16164
- case GGML_OP_NEG:
16165
- case GGML_OP_STEP:
16166
- case GGML_OP_TANH:
16167
- case GGML_OP_ELU:
16168
- case GGML_OP_RELU:
16169
- {
16170
- node->n_tasks = 1;
16171
- } break;
16172
- case GGML_OP_MUL:
16173
- case GGML_OP_GELU:
16174
- case GGML_OP_GELU_QUICK:
16175
- case GGML_OP_SILU:
16176
- case GGML_OP_SILU_BACK:
16177
- case GGML_OP_NORM:
16178
- case GGML_OP_RMS_NORM:
16179
- case GGML_OP_RMS_NORM_BACK:
16180
- {
16181
- node->n_tasks = n_threads;
16182
- } break;
16183
- case GGML_OP_MUL_MAT:
16184
- case GGML_OP_OUT_PROD:
16185
- {
16186
- node->n_tasks = n_threads;
16187
-
16188
- // TODO: use different scheduling for different matrix sizes
16189
- //const int nr0 = ggml_nrows(node->src0);
16190
- //const int nr1 = ggml_nrows(node->src1);
16191
-
16192
- //node->n_tasks = MIN(n_threads, MAX(1, nr0/128));
16193
- //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks);
16194
-
16195
- size_t cur = 0;
16196
- const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type;
16197
 
16198
  #if defined(GGML_USE_CUBLAS)
16199
- if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
16200
- node->n_tasks = 1; // TODO: this actually is doing nothing
16201
- // the threads are still spinning
16202
- }
16203
- else
16204
  #elif defined(GGML_USE_CLBLAST)
16205
- if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
16206
- node->n_tasks = 1; // TODO: this actually is doing nothing
16207
- // the threads are still spinning
16208
- cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
16209
- }
16210
- else
16211
  #endif
16212
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16213
- if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
16214
- node->n_tasks = 1; // TODO: this actually is doing nothing
16215
- // the threads are still spinning
16216
- if (node->src0->type != GGML_TYPE_F32) {
16217
- // here we need memory just for single 2D matrix from src0
16218
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
16219
- }
16220
- } else
16221
- #endif
16222
- if (node->src1->type != vec_dot_type) {
16223
- cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type];
16224
- } else {
16225
- cur = 0;
16226
  }
16227
 
16228
- work_size = MAX(work_size, cur);
16229
- } break;
16230
- case GGML_OP_SCALE:
16231
- {
16232
- node->n_tasks = 1;
16233
- } break;
16234
- case GGML_OP_SET:
16235
- case GGML_OP_CONT:
16236
- case GGML_OP_RESHAPE:
16237
- case GGML_OP_VIEW:
16238
- case GGML_OP_PERMUTE:
16239
- case GGML_OP_TRANSPOSE:
16240
- case GGML_OP_GET_ROWS:
16241
- case GGML_OP_GET_ROWS_BACK:
16242
- case GGML_OP_DIAG:
16243
- case GGML_OP_DIAG_MASK_ZERO:
16244
- {
16245
- node->n_tasks = 1;
16246
- } break;
16247
- case GGML_OP_DIAG_MASK_INF:
16248
- case GGML_OP_SOFT_MAX:
16249
- case GGML_OP_SOFT_MAX_BACK:
16250
- case GGML_OP_ROPE:
16251
- case GGML_OP_ROPE_BACK:
16252
- {
16253
- node->n_tasks = n_threads;
16254
- } break;
16255
- case GGML_OP_ALIBI:
16256
- {
16257
- node->n_tasks = 1; //TODO
16258
- } break;
16259
- case GGML_OP_CLAMP:
16260
- {
16261
- node->n_tasks = 1; //TODO
16262
- } break;
16263
- case GGML_OP_CONV_1D:
16264
- {
16265
- node->n_tasks = n_threads;
16266
-
16267
- GGML_ASSERT(node->src0->ne[3] == 1);
16268
- GGML_ASSERT(node->src1->ne[2] == 1);
16269
- GGML_ASSERT(node->src1->ne[3] == 1);
16270
-
16271
- size_t cur = 0;
16272
- const int nk = node->src0->ne[0];
16273
-
16274
- if (node->src0->type == GGML_TYPE_F16 &&
16275
- node->src1->type == GGML_TYPE_F32) {
16276
- cur = sizeof(ggml_fp16_t)*(
16277
- nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] +
16278
- ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1]
16279
- );
16280
- } else if (node->src0->type == GGML_TYPE_F32 &&
16281
- node->src1->type == GGML_TYPE_F32) {
16282
- cur = sizeof(float)*(
16283
- nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] +
16284
- ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1]
16285
- );
16286
- } else {
16287
- GGML_ASSERT(false);
16288
- }
16289
 
16290
- work_size = MAX(work_size, cur);
16291
- } break;
16292
- case GGML_OP_CONV_2D:
16293
- {
16294
- node->n_tasks = n_threads;
16295
 
16296
- GGML_ASSERT(node->src1->ne[3] == 1);
16297
 
16298
- const int64_t ne00 = node->src0->ne[0]; // W
16299
- const int64_t ne01 = node->src0->ne[1]; // H
16300
- const int64_t ne02 = node->src0->ne[2]; // C
16301
- const int64_t ne03 = node->src0->ne[3]; // N
16302
 
16303
- const int64_t ne10 = node->src1->ne[0]; // W
16304
- const int64_t ne11 = node->src1->ne[1]; // H
16305
- const int64_t ne12 = node->src1->ne[2]; // C
16306
 
16307
- const int64_t nk = ne00*ne01;
16308
 
16309
- UNUSED(ne02);
16310
- UNUSED(ne03);
16311
- UNUSED(nk);
16312
 
16313
- size_t cur = 0;
16314
 
16315
- if (node->src0->type == GGML_TYPE_F16 &&
16316
- node->src1->type == GGML_TYPE_F32) {
16317
- cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
16318
- } else if (node->src0->type == GGML_TYPE_F32 &&
16319
- node->src1->type == GGML_TYPE_F32) {
16320
- cur = sizeof(float)* (ne10*ne11*ne12);
16321
- } else {
16322
- GGML_ASSERT(false);
16323
- }
16324
 
16325
- work_size = MAX(work_size, cur);
16326
- } break;
16327
- case GGML_OP_FLASH_ATTN:
16328
- {
16329
- node->n_tasks = n_threads;
16330
 
16331
- size_t cur = 0;
16332
 
16333
- const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
16334
 
16335
- if (node->src1->type == GGML_TYPE_F32) {
16336
- cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
16337
- cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
16338
- }
16339
 
16340
- if (node->src1->type == GGML_TYPE_F16) {
16341
- cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
16342
- cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
16343
- }
16344
 
16345
- work_size = MAX(work_size, cur);
16346
- } break;
16347
- case GGML_OP_FLASH_FF:
16348
- {
16349
- node->n_tasks = n_threads;
16350
 
16351
- size_t cur = 0;
16352
 
16353
- if (node->src1->type == GGML_TYPE_F32) {
16354
- cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
16355
- cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
16356
- }
16357
 
16358
- if (node->src1->type == GGML_TYPE_F16) {
16359
- cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
16360
- cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
16361
- }
16362
 
16363
- work_size = MAX(work_size, cur);
16364
- } break;
16365
- case GGML_OP_FLASH_ATTN_BACK:
16366
- {
16367
- node->n_tasks = n_threads;
16368
 
16369
- size_t cur = 0;
16370
 
16371
- const int64_t D = node->src0->ne[0];
16372
- const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
16373
- const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
16374
- if (node->src1->type == GGML_TYPE_F32) {
16375
- cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1)
16376
- cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2
16377
- }
16378
 
16379
- if (node->src1->type == GGML_TYPE_F16) {
16380
- cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1)
16381
- cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2
16382
- }
16383
 
16384
- work_size = MAX(work_size, cur);
16385
- } break;
16386
- case GGML_OP_WIN_PART:
16387
- case GGML_OP_WIN_UNPART:
16388
- case GGML_OP_MAP_UNARY:
16389
- case GGML_OP_MAP_BINARY:
16390
- case GGML_OP_MAP_CUSTOM1:
16391
- case GGML_OP_MAP_CUSTOM2:
16392
- case GGML_OP_MAP_CUSTOM3:
16393
- {
16394
- node->n_tasks = 1;
16395
- } break;
16396
- case GGML_OP_CROSS_ENTROPY_LOSS:
16397
- {
16398
- node->n_tasks = n_threads;
16399
-
16400
- size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks);
16401
-
16402
- work_size = MAX(work_size, cur);
16403
- } break;
16404
- case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
16405
- {
16406
- node->n_tasks = n_threads;
16407
-
16408
- size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks;
16409
-
16410
- work_size = MAX(work_size, cur);
16411
- } break;
16412
- case GGML_OP_NONE:
16413
- {
16414
- node->n_tasks = 1;
16415
- } break;
16416
- case GGML_OP_COUNT:
16417
- {
16418
- GGML_ASSERT(false);
16419
- } break;
16420
- }
16421
- }
16422
 
16423
- if (cgraph->work != NULL && work_size > cgraph->work_size) {
16424
- GGML_ASSERT(false); // TODO: better handling
16425
  }
16426
 
16427
- if (work_size > 0 && cgraph->work == NULL) {
16428
- cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1);
16429
 
16430
- GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
16431
- cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);
16432
  }
16433
  }
16434
 
16435
  // create thread pool
16436
  if (n_threads > 1) {
16437
  for (int j = 1; j < n_threads; ++j) {
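The block removed above sized one shared work buffer for the whole graph: each op contributed a per-node cur, and the final allocation added CACHE_LINE_SIZE of padding per extra thread. A self-contained worked example of that arithmetic for a quantized GGML_OP_ADD row, assuming a 64-byte cache line and the hypothetical sizes in the comments (neither value is given in this extract):

    #include <stdio.h>
    #include <stddef.h>

    /* Worked example of the work-buffer sizing removed above. */
    int main(void) {
        const size_t ne0        = 4096;  /* src0 row length (hypothetical)           */
        const int    n_threads  = 8;
        const size_t cache_line = 64;    /* CACHE_LINE_SIZE is not shown in the diff */

        /* quantized GGML_OP_ADD: one dequantized f32 row per thread */
        const size_t cur       = sizeof(float) * ne0 * n_threads;      /* 131072 bytes */
        const size_t work_size = cur + cache_line * (n_threads - 1);   /* + 448 bytes  */

        printf("work buffer: %zu bytes\n", work_size);                 /* 131520       */
        return 0;
    }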
@@ -16493,6 +16535,17 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16493
  }
16494
  }
16495
 
16496
  struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
16497
  for (int i = 0; i < cgraph->n_leafs; i++) {
16498
  struct ggml_tensor * leaf = cgraph->leafs[i];
@@ -16531,14 +16584,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
16531
  const int64_t * ne = tensor->ne;
16532
  const size_t * nb = tensor->nb;
16533
 
16534
- fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
16535
  arg,
16536
  ggml_type_name(tensor->type),
16537
  ggml_op_name (tensor->op),
16538
  tensor->n_dims,
16539
  ne[0], ne[1], ne[2], ne[3],
16540
  nb[0], nb[1], nb[2], nb[3],
16541
- tensor->n_tasks,
16542
  tensor->data,
16543
  tensor->name);
16544
  }
@@ -16575,8 +16627,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16575
  ggml_graph_export_leaf(cgraph->leafs[i], fout);
16576
 
16577
  GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
16578
- GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
16579
- GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
16580
  }
16581
 
16582
  // header
@@ -16587,17 +16639,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16587
  for (int i = 0; i < cgraph->n_nodes; ++i) {
16588
  ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
16589
 
16590
- if (cgraph->nodes[i]->src0) {
16591
- ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
16592
- }
16593
-
16594
- if (cgraph->nodes[i]->src1) {
16595
- ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
16596
- }
16597
-
16598
- for (int j = 0; j < GGML_MAX_OPT; ++j) {
16599
- if (cgraph->nodes[i]->opt[j]) {
16600
- ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
16601
  }
16602
  }
16603
 
@@ -16688,16 +16732,13 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16688
 
16689
  // output the op arguments
16690
  {
16691
- struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
16692
 
16693
- args[0] = tensor->src0;
16694
- args[1] = tensor->src1;
16695
-
16696
- for (int j = 0; j < GGML_MAX_OPT; ++j) {
16697
- args[2 + j] = tensor->opt[j];
16698
  }
16699
 
16700
- for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
16701
  if (args[j]) {
16702
  int32_t idx = -1;
16703
 
@@ -16915,12 +16956,12 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
16915
 
16916
  const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
16917
 
16918
- const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
16919
 
16920
- struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
16921
 
16922
  // parse args
16923
- for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
16924
  const int32_t arg_idx = ptr_arg_idx[j];
16925
 
16926
  if (arg_idx == -1) {
@@ -16977,11 +17018,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
16977
  tensor->nb[j] = nb[j];
16978
  }
16979
 
16980
- tensor->src0 = args[0];
16981
- tensor->src1 = args[1];
16982
-
16983
- for (int j = 0; j < GGML_MAX_OPT; ++j) {
16984
- tensor->opt[j] = args[2 + j];
16985
  }
16986
 
16987
  result.nodes[i] = tensor;
@@ -17180,19 +17218,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17180
  for (int i = 0; i < gb->n_nodes; i++) {
17181
  struct ggml_tensor * node = gb->nodes[i];
17182
 
17183
- if (node->src0) {
17184
- ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
17185
- }
17186
-
17187
- if (node->src1) {
17188
- ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
17189
- }
17190
-
17191
- for (int j = 0; j < GGML_MAX_OPT; j++) {
17192
- if (node->opt[j]) {
17193
  char label[16];
17194
- snprintf(label, sizeof(label), "opt %d", j);
17195
- ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
17196
  }
17197
  }
17198
  }
@@ -17200,19 +17230,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17200
  for (int i = 0; i < gb->n_leafs; i++) {
17201
  struct ggml_tensor * node = gb->leafs[i];
17202
 
17203
- if (node->src0) {
17204
- ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
17205
- }
17206
-
17207
- if (node->src1) {
17208
- ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
17209
- }
17210
-
17211
- for (int j = 0; j < GGML_MAX_OPT; j++) {
17212
- if (node->opt[j]) {
17213
  char label[16];
17214
- snprintf(label, sizeof(label), "opt %d", j);
17215
- ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
17216
  }
17217
  }
17218
  }
@@ -17274,9 +17296,6 @@ static enum ggml_opt_result ggml_opt_adam(
17274
  struct ggml_cgraph * gb) {
17275
  GGML_ASSERT(ggml_is_scalar(f));
17276
 
17277
- gf->n_threads = params.n_threads;
17278
- gb->n_threads = params.n_threads;
17279
-
17280
  // these will store the parameters we want to optimize
17281
  struct ggml_tensor * ps[GGML_MAX_PARAMS];
17282
 
@@ -17323,7 +17342,8 @@ static enum ggml_opt_result ggml_opt_adam(
17323
  // compute the function value
17324
  ggml_graph_reset (gf);
17325
  ggml_set_f32 (f->grad, 1.0f);
17326
- ggml_graph_compute(ctx, gb);
 
17327
 
17328
  opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
17329
  opt->adam.fx_best = opt->adam.fx_prev;
@@ -17403,7 +17423,8 @@ static enum ggml_opt_result ggml_opt_adam(
17403
 
17404
  ggml_graph_reset (gf);
17405
  ggml_set_f32 (f->grad, 1.0f);
17406
- ggml_graph_compute(ctx, gb);
 
17407
 
17408
  const float fx = ggml_get_f32_1d(f, 0);
17409
 
@@ -17525,7 +17546,8 @@ static enum ggml_opt_result linesearch_backtracking(
17525
 
17526
  ggml_graph_reset (gf);
17527
  ggml_set_f32 (f->grad, 1.0f);
17528
- ggml_graph_compute(ctx, gb);
 
17529
 
17530
  ggml_opt_get_grad(np, ps, g);
17531
 
@@ -17593,9 +17615,6 @@ static enum ggml_opt_result ggml_opt_lbfgs(
17593
  }
17594
  }
17595
 
17596
- gf->n_threads = params.n_threads;
17597
- gb->n_threads = params.n_threads;
17598
-
17599
  const int m = params.lbfgs.m;
17600
 
17601
  // these will store the parameters we want to optimize
@@ -17647,7 +17666,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
17647
 
17648
  ggml_graph_reset (gf);
17649
  ggml_set_f32 (f->grad, 1.0f);
17650
- ggml_graph_compute(ctx, gb);
 
17651
 
17652
  ggml_opt_get_grad(np, ps, g);
17653
 
 
247
  #include "ggml-opencl.h"
248
  #endif
249
  #elif defined(GGML_USE_OPENBLAS)
250
+ #if defined(GGML_BLAS_USE_MKL)
251
+ #include <mkl.h>
252
+ #else
253
  #include <cblas.h>
254
+ #endif
255
  #elif defined(GGML_USE_CUBLAS)
256
  #include "ggml-cuda.h"
257
  #elif defined(GGML_USE_CLBLAST)
 
4284
  #define ggml_assert_aligned(ptr) \
4285
  GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
4286
 
4287
+ static bool useNtkRope = true; //uses linear rope if not NTK
4288
+ void set_ntk_rope_scale_mode(bool useNtk)
4289
+ {
4290
+ useNtkRope = useNtk;
4291
+ }
4292
+ bool get_ntk_rope_scale_mode()
4293
+ {
4294
+ return useNtkRope;
4295
+ }
4296
  float get_theta_scale(int n_dims,int n_past,int n_ctx)
4297
  {
4298
+ if (!get_ntk_rope_scale_mode())
4299
+ {
4300
+ return powf(10000.0, -2.0f / n_dims);
4301
+ }
4302
+ if (n_ctx <= 2048) //normie mode
4303
+ {
4304
+ return powf(10000.0, -2.0f / n_dims);
4305
+ }
4306
+ else
4307
+ {
4308
+ //using scaled NTK aware ctx
4309
+ float a = (n_ctx <= 4096 ? 4.0 : 8.0);
4310
+ float m = powf(a, n_dims / (n_dims - 2.0));
4311
+ float s = powf(10000.0 * m, -2.0f / n_dims);
4312
+ return s;
4313
+ }
4314
  }
4315
 
4316
  ////////////////////////////////////////////////////////////////////////////////
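The new get_theta_scale keeps the stock RoPE base, 10000^(-2/n_dims), for contexts up to 2048 tokens; for longer contexts it inflates the base by a^(n_dims/(n_dims-2)), with a = 4 up to 4096 tokens and a = 8 beyond that (the NTK-aware path), unless linear-rope mode is selected via set_ntk_rope_scale_mode(false). A small self-contained check of the formula, assuming a head size of 128 (n_dims is not fixed anywhere in this extract):

    #include <math.h>
    #include <stdio.h>

    /* Mirrors the NTK-aware branch of get_theta_scale() added above
     * (n_past is unused by the logic shown, so it is omitted here). */
    static float theta_scale(int n_dims, int n_ctx) {
        if (n_ctx <= 2048) {
            return powf(10000.0f, -2.0f / n_dims);
        }
        const float a = (n_ctx <= 4096) ? 4.0f : 8.0f;
        const float m = powf(a, n_dims / (n_dims - 2.0f));
        return powf(10000.0f * m, -2.0f / n_dims);
    }

    int main(void) {
        const int n_dims = 128; /* assumed head size */
        printf("ctx 2048: %.6f\n", theta_scale(n_dims, 2048)); /* plain 10000^(-2/128)       */
        printf("ctx 4096: %.6f\n", theta_scale(n_dims, 4096)); /* base scaled by 4^(128/126) */
        printf("ctx 8192: %.6f\n", theta_scale(n_dims, 8192)); /* base scaled by 8^(128/126) */
        return 0;
    }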
 
4614
  /*.op =*/ GGML_OP_NONE,
4615
  /*.is_param =*/ false,
4616
  /*.grad =*/ NULL,
4617
+ /*.src =*/ { NULL },
4618
  /*.perf_runs =*/ 0,
4619
  /*.perf_cycles =*/ 0,
4620
  /*.perf_time_us =*/ 0,
4621
  /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
4622
  /*.name =*/ { 0 },
4623
  /*.extra =*/ NULL,
4624
+ /*.padding =*/ { 0 },
4625
  };
4626
 
4627
  // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
 
5040
 
5041
  result->op = GGML_OP_DUP;
5042
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5043
+ result->src[0] = a;
5044
+ result->src[1] = NULL;
5045
 
5046
  return result;
5047
  }
 
5077
 
5078
  result->op = GGML_OP_ADD;
5079
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5080
+ result->src[0] = a;
5081
+ result->src[1] = b;
5082
 
5083
  return result;
5084
  }
 
5117
 
5118
  result->op = GGML_OP_ADD1;
5119
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5120
+ result->src[0] = a;
5121
+ result->src[1] = b;
5122
 
5123
  return result;
5124
  }
 
5175
 
5176
  result->op = GGML_OP_ACC;
5177
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5178
+ result->src[0] = a;
5179
+ result->src[1] = b;
5180
+ result->src[2] = c;
5181
 
5182
  return result;
5183
  }
 
5223
 
5224
  result->op = GGML_OP_SUB;
5225
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5226
+ result->src[0] = a;
5227
+ result->src[1] = b;
5228
 
5229
  return result;
5230
  }
 
5270
 
5271
  result->op = GGML_OP_MUL;
5272
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5273
+ result->src[0] = a;
5274
+ result->src[1] = b;
5275
 
5276
  return result;
5277
  }
 
5313
 
5314
  result->op = GGML_OP_DIV;
5315
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5316
+ result->src[0] = a;
5317
+ result->src[1] = b;
5318
 
5319
  return result;
5320
  }
 
5349
 
5350
  result->op = GGML_OP_SQR;
5351
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5352
+ result->src[0] = a;
5353
+ result->src[1] = NULL;
5354
 
5355
  return result;
5356
  }
 
5383
 
5384
  result->op = GGML_OP_SQRT;
5385
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5386
+ result->src[0] = a;
5387
+ result->src[1] = NULL;
5388
 
5389
  return result;
5390
  }
 
5418
 
5419
  result->op = GGML_OP_LOG;
5420
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5421
+ result->src[0] = a;
5422
+ result->src[1] = NULL;
5423
 
5424
  return result;
5425
  }
 
5451
 
5452
  result->op = GGML_OP_SUM;
5453
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5454
+ result->src[0] = a;
5455
+ result->src[1] = NULL;
5456
 
5457
  return result;
5458
  }
 
5478
 
5479
  result->op = GGML_OP_SUM_ROWS;
5480
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5481
+ result->src[0] = a;
5482
+ result->src[1] = NULL;
5483
 
5484
  return result;
5485
  }
 
5501
 
5502
  result->op = GGML_OP_MEAN;
5503
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5504
+ result->src[0] = a;
5505
+ result->src[1] = NULL;
5506
 
5507
  return result;
5508
  }
 
5525
 
5526
  result->op = GGML_OP_ARGMAX;
5527
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5528
+ result->src[0] = a;
5529
+ result->src[1] = NULL;
5530
 
5531
  return result;
5532
  }
 
5553
 
5554
  result->op = GGML_OP_REPEAT;
5555
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5556
+ result->src[0] = a;
5557
+ result->src[1] = b;
5558
 
5559
  return result;
5560
  }
 
5581
 
5582
  result->op = GGML_OP_REPEAT_BACK;
5583
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5584
+ result->src[0] = a;
5585
+ result->src[1] = b;
5586
 
5587
  return result;
5588
  }
 
5603
 
5604
  result->op = GGML_OP_ABS;
5605
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5606
+ result->src[0] = a;
5607
+ result->src[1] = NULL;
5608
 
5609
  return result;
5610
  }
 
5638
 
5639
  result->op = GGML_OP_SGN;
5640
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5641
+ result->src[0] = a;
5642
+ result->src[1] = NULL;
5643
 
5644
  return result;
5645
  }
 
5672
 
5673
  result->op = GGML_OP_NEG;
5674
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5675
+ result->src[0] = a;
5676
+ result->src[1] = NULL;
5677
 
5678
  return result;
5679
  }
 
5706
 
5707
  result->op = GGML_OP_STEP;
5708
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5709
+ result->src[0] = a;
5710
+ result->src[1] = NULL;
5711
 
5712
  return result;
5713
  }
 
5740
 
5741
  result->op = GGML_OP_TANH;
5742
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5743
+ result->src[0] = a;
5744
+ result->src[1] = NULL;
5745
 
5746
  return result;
5747
  }
 
5774
 
5775
  result->op = GGML_OP_ELU;
5776
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5777
+ result->src[0] = a;
5778
+ result->src[1] = NULL;
5779
 
5780
  return result;
5781
  }
 
5808
 
5809
  result->op = GGML_OP_RELU;
5810
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5811
+ result->src[0] = a;
5812
+ result->src[1] = NULL;
5813
 
5814
  return result;
5815
  }
 
5842
 
5843
  result->op = GGML_OP_GELU;
5844
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5845
+ result->src[0] = a;
5846
+ result->src[1] = NULL;
5847
 
5848
  return result;
5849
  }
 
5876
 
5877
  result->op = GGML_OP_GELU_QUICK;
5878
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5879
+ result->src[0] = a;
5880
+ result->src[1] = NULL;
5881
 
5882
  return result;
5883
  }
 
5910
 
5911
  result->op = GGML_OP_SILU;
5912
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5913
+ result->src[0] = a;
5914
+ result->src[1] = NULL;
5915
 
5916
  return result;
5917
  }
 
5945
 
5946
  result->op = GGML_OP_SILU_BACK;
5947
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5948
+ result->src[0] = a;
5949
+ result->src[1] = b;
5950
 
5951
  return result;
5952
  }
 
5968
 
5969
  result->op = GGML_OP_NORM;
5970
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5971
+ result->src[0] = a;
5972
+ result->src[1] = NULL; // TODO: maybe store epsilon here?
5973
 
5974
  return result;
5975
  }
 
6000
 
6001
  result->op = GGML_OP_RMS_NORM;
6002
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6003
+ result->src[0] = a;
6004
+ result->src[1] = NULL; // TODO: maybe store epsilon here?
6005
 
6006
  return result;
6007
  }
 
6033
 
6034
  result->op = GGML_OP_RMS_NORM_BACK;
6035
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6036
+ result->src[0] = a;
6037
+ result->src[1] = b;
6038
 
6039
  return result;
6040
  }
 
6060
 
6061
  result->op = GGML_OP_MUL_MAT;
6062
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6063
+ result->src[0] = a;
6064
+ result->src[1] = b;
6065
 
6066
  return result;
6067
  }
 
6086
 
6087
  result->op = GGML_OP_OUT_PROD;
6088
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6089
+ result->src[0] = a;
6090
+ result->src[1] = b;
6091
 
6092
  return result;
6093
  }
 
6112
 
6113
  result->op = GGML_OP_SCALE;
6114
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6115
+ result->src[0] = a;
6116
+ result->src[1] = b;
6117
 
6118
  return result;
6119
  }
 
6168
 
6169
  result->op = GGML_OP_SET;
6170
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6171
+ result->src[0] = a;
6172
+ result->src[1] = b;
6173
+ result->src[2] = c;
6174
 
6175
  return result;
6176
  }
 
6257
 
6258
  result->op = GGML_OP_CPY;
6259
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6260
+ result->src[0] = a;
6261
+ result->src[1] = b;
6262
 
6263
  return result;
6264
  }
 
6294
 
6295
  result->op = GGML_OP_CONT;
6296
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6297
+ result->src[0] = a;
6298
+ result->src[1] = NULL;
6299
 
6300
  return result;
6301
  }
 
6338
 
6339
  result->op = GGML_OP_RESHAPE;
6340
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6341
+ result->src[0] = a;
6342
+ result->src[1] = NULL;
6343
 
6344
  return result;
6345
  }
 
6363
 
6364
  result->op = GGML_OP_RESHAPE;
6365
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6366
+ result->src[0] = a;
6367
+ result->src[1] = NULL;
6368
 
6369
  return result;
6370
  }
 
6389
 
6390
  result->op = GGML_OP_RESHAPE;
6391
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6392
+ result->src[0] = a;
6393
+ result->src[1] = NULL;
6394
 
6395
  return result;
6396
  }
 
6416
 
6417
  result->op = GGML_OP_RESHAPE;
6418
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6419
+ result->src[0] = a;
6420
+ result->src[1] = NULL;
6421
 
6422
  return result;
6423
  }
 
6445
 
6446
  result->op = GGML_OP_RESHAPE;
6447
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6448
+ result->src[0] = a;
6449
+ result->src[1] = NULL;
6450
 
6451
  return result;
6452
  }
 
6478
 
6479
  result->op = GGML_OP_VIEW;
6480
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6481
+ result->src[0] = a;
6482
+ result->src[1] = NULL;
6483
+ result->src[2] = offs;
6484
 
6485
  return result;
6486
  }
 
6520
 
6521
  result->op = GGML_OP_VIEW;
6522
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6523
+ result->src[0] = a;
6524
+ result->src[1] = NULL;
6525
+ result->src[2] = offs;
6526
 
6527
  return result;
6528
  }
 
6564
 
6565
  result->op = GGML_OP_VIEW;
6566
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6567
+ result->src[0] = a;
6568
+ result->src[1] = NULL;
6569
+ result->src[2] = offs;
6570
 
6571
  return result;
6572
  }
 
6610
 
6611
  result->op = GGML_OP_VIEW;
6612
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6613
+ result->src[0] = a;
6614
+ result->src[1] = NULL;
6615
+ result->src[2] = offs;
6616
 
6617
  return result;
6618
  }
 
6672
 
6673
  result->op = GGML_OP_PERMUTE;
6674
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6675
+ result->src[0] = a;
6676
+ result->src[1] = NULL;
6677
 
6678
  if (is_node) {
6679
  ggml_scratch_save(ctx);
 
6687
 
6688
  ggml_scratch_load(ctx);
6689
 
6690
+ result->src[2] = b;
6691
  }
6692
 
6693
  return result;
 
6715
 
6716
  result->op = GGML_OP_TRANSPOSE;
6717
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6718
+ result->src[0] = a;
6719
+ result->src[1] = NULL;
6720
 
6721
  return result;
6722
  }
 
6741
 
6742
  result->op = GGML_OP_GET_ROWS;
6743
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6744
+ result->src[0] = a;
6745
+ result->src[1] = b;
6746
 
6747
  return result;
6748
  }
 
6769
 
6770
  result->op = GGML_OP_GET_ROWS_BACK;
6771
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6772
+ result->src[0] = a;
6773
+ result->src[1] = b;
6774
+ result->src[2] = c;
6775
 
6776
  return result;
6777
  }
 
6793
 
6794
  result->op = GGML_OP_DIAG;
6795
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6796
+ result->src[0] = a;
6797
+ result->src[1] = NULL;
6798
 
6799
  return result;
6800
  }
 
6826
 
6827
  result->op = GGML_OP_DIAG_MASK_INF;
6828
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6829
+ result->src[0] = a;
6830
+ result->src[1] = b;
6831
 
6832
  return result;
6833
  }
 
6874
 
6875
  result->op = GGML_OP_DIAG_MASK_ZERO;
6876
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6877
+ result->src[0] = a;
6878
+ result->src[1] = b;
6879
 
6880
  return result;
6881
  }
 
6910
 
6911
  result->op = GGML_OP_SOFT_MAX;
6912
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6913
+ result->src[0] = a;
6914
+ result->src[1] = NULL;
6915
 
6916
  return result;
6917
  }
 
6946
 
6947
  result->op = GGML_OP_SOFT_MAX_BACK;
6948
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6949
+ result->src[0] = a;
6950
+ result->src[1] = b;
6951
 
6952
  return result;
6953
  }
 
6998
 
6999
  result->op = GGML_OP_ROPE;
7000
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7001
+ result->src[0] = a;
7002
+ result->src[1] = b;
7003
 
7004
  return result;
7005
  }
 
7056
 
7057
  result->op = GGML_OP_ROPE_BACK;
7058
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7059
+ result->src[0] = a;
7060
+ result->src[1] = b;
7061
 
7062
  return result;
7063
  }
 
7095
 
7096
  result->op = GGML_OP_ALIBI;
7097
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7098
+ result->src[0] = a;
7099
+ result->src[1] = b;
7100
 
7101
  return result;
7102
  }
 
7129
 
7130
  result->op = GGML_OP_CLAMP;
7131
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7132
+ result->src[0] = a;
7133
+ result->src[1] = b;
7134
 
7135
  return result;
7136
  }
 
7172
 
7173
  result->op = GGML_OP_CONV_1D;
7174
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7175
+ result->src[0] = a;
7176
+ result->src[1] = b;
7177
+ result->src[2] = c;
7178
 
7179
  return result;
7180
  }
 
7220
 
7221
  result->op = GGML_OP_CONV_2D;
7222
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7223
+ result->src[0] = a;
7224
+ result->src[1] = b;
7225
+ result->src[2] = c;
7226
 
7227
  return result;
7228
 
 
7261
 
7262
  result->op = GGML_OP_FLASH_ATTN;
7263
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7264
+ result->src[0] = q;
7265
+ result->src[1] = k;
7266
+ result->src[2] = v;
7267
+ result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0);
7268
 
7269
  return result;
7270
  }
 
7292
 
7293
  result->op = GGML_OP_FLASH_FF;
7294
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7295
+ result->src[0] = a;
7296
+ result->src[1] = b0;
7297
+ result->src[2] = b1;
7298
+ result->src[3] = c0;
7299
+ result->src[4] = c1;
7300
 
7301
  return result;
7302
  }
 
7356
 
7357
  result->op = GGML_OP_FLASH_ATTN_BACK;
7358
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7359
+ result->src[0] = q;
7360
+ result->src[1] = k;
7361
+ result->src[2] = v;
7362
+ result->src[3] = d;
7363
+ result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0);
7364
 
7365
  return result;
7366
  }
 
7405
 
7406
  result->op = GGML_OP_WIN_PART;
7407
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7408
+ result->src[0] = a;
7409
+ result->src[1] = NULL;
7410
+ result->src[2] = b;
7411
 
7412
  return result;
7413
  }
 
7442
 
7443
  result->op = GGML_OP_WIN_UNPART;
7444
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7445
+ result->src[0] = a;
7446
+ result->src[1] = NULL;
7447
+ result->src[2] = b;
7448
 
7449
  return result;
7450
  }
 
7473
 
7474
  result->op = GGML_OP_MAP_UNARY;
7475
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7476
+ result->src[0] = a;
7477
+ result->src[2] = addr_tensor;
7478
 
7479
  return result;
7480
  }
 
7520
 
7521
  result->op = GGML_OP_MAP_BINARY;
7522
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7523
+ result->src[0] = a;
7524
+ result->src[1] = b;
7525
+ result->src[2] = addr_tensor;
7526
 
7527
  return result;
7528
  }
 
7567
 
7568
  result->op = GGML_OP_MAP_CUSTOM1;
7569
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7570
+ result->src[0] = a;
7571
+ result->src[2] = addr_tensor;
7572
 
7573
  return result;
7574
  }
 
7612
 
7613
  result->op = GGML_OP_MAP_CUSTOM2;
7614
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7615
+ result->src[0] = a;
7616
+ result->src[1] = b;
7617
+ result->src[2] = addr_tensor;
7618
 
7619
  return result;
7620
  }
 
7661
 
7662
  result->op = GGML_OP_MAP_CUSTOM3;
7663
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7664
+ result->src[0] = a;
7665
+ result->src[1] = b;
7666
+ result->src[2] = addr_tensor;
7667
+ result->src[3] = c;
7668
 
7669
  return result;
7670
  }
 
7704
 
7705
  result->op = GGML_OP_CROSS_ENTROPY_LOSS;
7706
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7707
+ result->src[0] = a;
7708
+ result->src[1] = b;
7709
 
7710
  return result;
7711
  }
 
7724
 
7725
  result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
7726
  result->grad = NULL;
7727
+ result->src[0] = a;
7728
+ result->src[1] = b;
7729
+ result->src[2] = c;
7730
 
7731
  return result;
7732
  }
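The builder changes above all follow one pattern: each op now records its operands in the new src[] array (with opt-style extras moving to higher slots) instead of the old src0/src1/opt fields. A minimal sketch of how code can now treat a node's inputs uniformly, in the spirit of the ggml_visit_parents loop further down (count_parents is a hypothetical helper, not part of the commit):

#include "ggml.h"

// Sketch only: with the unified src[] array, a caller can walk a node's parents generically
// instead of checking src0, src1 and each opt[k] by hand.
static int count_parents(const struct ggml_tensor * node) {
    int n = 0;
    for (int i = 0; i < GGML_MAX_SRC; ++i) {
        if (node->src[i] != NULL) {
            n++;
        }
    }
    return n;
}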
 
10749
 
10750
  float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
10751
 
 
 
10752
  for (int64_t ic = 0; ic < ne11; ++ic) {
10753
  vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
10754
  }
 
12055
  dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
12056
  }
12057
  } else if (!is_neox) {
12058
+ if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
12059
+ theta = theta * GGML_TRAINING_CTX / n_ctx;
12060
+ }
12061
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12062
  const float cos_theta = cosf(theta);
12063
  const float sin_theta = sinf(theta);
 
12185
  dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
12186
  }
12187
  } if (!is_neox) {
12188
+ if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
12189
+ theta = theta * GGML_TRAINING_CTX / n_ctx;
12190
+ }
12191
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12192
  const float cos_theta = cosf(theta);
12193
  const float sin_theta = sinf(theta);
 
12313
  float theta = (float)p;
12314
 
12315
  if (!is_neox) {
12316
+ if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
12317
+ theta = theta * GGML_TRAINING_CTX / n_ctx;
12318
+ }
12319
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12320
  const float cos_theta = cosf(theta);
12321
  const float sin_theta = sinf(theta);
 
12416
  float theta = (float)p;
12417
 
12418
  if (!is_neox) {
12419
+ if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
12420
+ theta = theta * GGML_TRAINING_CTX / n_ctx;
12421
+ }
12422
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12423
  const float cos_theta = cosf(theta);
12424
  const float sin_theta = sinf(theta);
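These four RoPE hunks all add the same guard: when NTK-aware scaling is off and the requested context exceeds the training context, the rotation angle is shrunk linearly instead. A minimal sketch of that scaling, using only symbols introduced elsewhere in this commit (get_ntk_rope_scale_mode, GGML_TRAINING_CTX); scale_theta itself is a hypothetical wrapper:

#include "ggml.h"

// Sketch: the linear RoPE position scaling applied in the hunks above. With the default
// GGML_TRAINING_CTX of 2048 and n_ctx = 4096, theta is halved, so the largest position is
// rotated as if it still fell inside the trained context window.
static float scale_theta(float theta, int n_ctx) {
    if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
        theta = theta * GGML_TRAINING_CTX / n_ctx;  // float multiply first, then divide
    }
    return theta;
}

Because GGML_TRAINING_CTX is guarded by #ifndef in ggml.h, it can also be overridden at build time (for example with -DGGML_TRAINING_CTX=4096) for models trained on longer contexts.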
 
14609
  if (skip_cpu) {
14610
  return;
14611
  }
14612
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
14613
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
14614
  #endif // GGML_USE_CUBLAS
14615
 
14616
  switch (tensor->op) {
14617
  case GGML_OP_DUP:
14618
  {
14619
+ ggml_compute_forward_dup(params, tensor->src[0], tensor);
14620
  } break;
14621
  case GGML_OP_ADD:
14622
  {
14623
+ ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
14624
  } break;
14625
  case GGML_OP_ADD1:
14626
  {
14627
+ ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor);
14628
  } break;
14629
  case GGML_OP_ACC:
14630
  {
14631
+ ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14632
  } break;
14633
  case GGML_OP_SUB:
14634
  {
14635
+ ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor);
14636
  } break;
14637
  case GGML_OP_MUL:
14638
  {
14639
+ ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor);
14640
  } break;
14641
  case GGML_OP_DIV:
14642
  {
14643
+ ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor);
14644
  } break;
14645
  case GGML_OP_SQR:
14646
  {
14647
+ ggml_compute_forward_sqr(params, tensor->src[0], tensor);
14648
  } break;
14649
  case GGML_OP_SQRT:
14650
  {
14651
+ ggml_compute_forward_sqrt(params, tensor->src[0], tensor);
14652
  } break;
14653
  case GGML_OP_LOG:
14654
  {
14655
+ ggml_compute_forward_log(params, tensor->src[0], tensor);
14656
  } break;
14657
  case GGML_OP_SUM:
14658
  {
14659
+ ggml_compute_forward_sum(params, tensor->src[0], tensor);
14660
  } break;
14661
  case GGML_OP_SUM_ROWS:
14662
  {
14663
+ ggml_compute_forward_sum_rows(params, tensor->src[0], tensor);
14664
  } break;
14665
  case GGML_OP_MEAN:
14666
  {
14667
+ ggml_compute_forward_mean(params, tensor->src[0], tensor);
14668
  } break;
14669
  case GGML_OP_ARGMAX:
14670
  {
14671
+ ggml_compute_forward_argmax(params, tensor->src[0], tensor);
14672
  } break;
14673
  case GGML_OP_REPEAT:
14674
  {
14675
+ ggml_compute_forward_repeat(params, tensor->src[0], tensor);
14676
  } break;
14677
  case GGML_OP_REPEAT_BACK:
14678
  {
14679
+ ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
14680
  } break;
14681
  case GGML_OP_ABS:
14682
  {
14683
+ ggml_compute_forward_abs(params, tensor->src[0], tensor);
14684
  } break;
14685
  case GGML_OP_SGN:
14686
  {
14687
+ ggml_compute_forward_sgn(params, tensor->src[0], tensor);
14688
  } break;
14689
  case GGML_OP_NEG:
14690
  {
14691
+ ggml_compute_forward_neg(params, tensor->src[0], tensor);
14692
  } break;
14693
  case GGML_OP_STEP:
14694
  {
14695
+ ggml_compute_forward_step(params, tensor->src[0], tensor);
14696
  } break;
14697
  case GGML_OP_TANH:
14698
  {
14699
+ ggml_compute_forward_tanh(params, tensor->src[0], tensor);
14700
  } break;
14701
  case GGML_OP_ELU:
14702
  {
14703
+ ggml_compute_forward_elu(params, tensor->src[0], tensor);
14704
  } break;
14705
  case GGML_OP_RELU:
14706
  {
14707
+ ggml_compute_forward_relu(params, tensor->src[0], tensor);
14708
  } break;
14709
  case GGML_OP_GELU:
14710
  {
14711
+ ggml_compute_forward_gelu(params, tensor->src[0], tensor);
14712
  } break;
14713
  case GGML_OP_GELU_QUICK:
14714
  {
14715
+ ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor);
14716
  } break;
14717
  case GGML_OP_SILU:
14718
  {
14719
+ ggml_compute_forward_silu(params, tensor->src[0], tensor);
14720
  } break;
14721
  case GGML_OP_SILU_BACK:
14722
  {
14723
+ ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
14724
  } break;
14725
  case GGML_OP_NORM:
14726
  {
14727
+ ggml_compute_forward_norm(params, tensor->src[0], tensor);
14728
  } break;
14729
  case GGML_OP_RMS_NORM:
14730
  {
14731
+ ggml_compute_forward_rms_norm(params, tensor->src[0], tensor);
14732
  } break;
14733
  case GGML_OP_RMS_NORM_BACK:
14734
  {
14735
+ ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
14736
  } break;
14737
  case GGML_OP_MUL_MAT:
14738
  {
14739
+ ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
14740
  } break;
14741
  case GGML_OP_OUT_PROD:
14742
  {
14743
+ ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
14744
  } break;
14745
  case GGML_OP_SCALE:
14746
  {
14747
+ ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor);
14748
  } break;
14749
  case GGML_OP_SET:
14750
  {
14751
+ ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14752
  } break;
14753
  case GGML_OP_CPY:
14754
  {
14755
+ ggml_compute_forward_cpy(params, tensor->src[0], tensor);
14756
  } break;
14757
  case GGML_OP_CONT:
14758
  {
14759
+ ggml_compute_forward_cont(params, tensor->src[0], tensor);
14760
  } break;
14761
  case GGML_OP_RESHAPE:
14762
  {
14763
+ ggml_compute_forward_reshape(params, tensor->src[0], tensor);
14764
  } break;
14765
  case GGML_OP_VIEW:
14766
  {
14767
+ ggml_compute_forward_view(params, tensor->src[0]);
14768
  } break;
14769
  case GGML_OP_PERMUTE:
14770
  {
14771
+ ggml_compute_forward_permute(params, tensor->src[0]);
14772
  } break;
14773
  case GGML_OP_TRANSPOSE:
14774
  {
14775
+ ggml_compute_forward_transpose(params, tensor->src[0]);
14776
  } break;
14777
  case GGML_OP_GET_ROWS:
14778
  {
14779
+ ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor);
14780
  } break;
14781
  case GGML_OP_GET_ROWS_BACK:
14782
  {
14783
+ ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14784
  } break;
14785
  case GGML_OP_DIAG:
14786
  {
14787
+ ggml_compute_forward_diag(params, tensor->src[0], tensor);
14788
  } break;
14789
  case GGML_OP_DIAG_MASK_INF:
14790
  {
14791
+ ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor);
14792
  } break;
14793
  case GGML_OP_DIAG_MASK_ZERO:
14794
  {
14795
+ ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor);
14796
  } break;
14797
  case GGML_OP_SOFT_MAX:
14798
  {
14799
+ ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
14800
  } break;
14801
  case GGML_OP_SOFT_MAX_BACK:
14802
  {
14803
+ ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor);
14804
  } break;
14805
  case GGML_OP_ROPE:
14806
  {
14807
+ ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
14808
  } break;
14809
  case GGML_OP_ROPE_BACK:
14810
  {
14811
+ ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
14812
  } break;
14813
  case GGML_OP_ALIBI:
14814
  {
14815
+ ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor);
14816
  } break;
14817
  case GGML_OP_CLAMP:
14818
  {
14819
+ ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor);
14820
  } break;
14821
  case GGML_OP_CONV_1D:
14822
  {
14823
+ ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14824
  } break;
14825
  case GGML_OP_CONV_2D:
14826
  {
14827
+ ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14828
  } break;
14829
  case GGML_OP_FLASH_ATTN:
14830
  {
14831
+ const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
14832
  GGML_ASSERT(t == 0 || t == 1);
14833
  const bool masked = t != 0;
14834
+ ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
14835
  } break;
14836
  case GGML_OP_FLASH_FF:
14837
  {
14838
+ ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
14839
  } break;
14840
  case GGML_OP_FLASH_ATTN_BACK:
14841
  {
14842
+ int32_t t = ggml_get_i32_1d(tensor->src[4], 0);
14843
  GGML_ASSERT(t == 0 || t == 1);
14844
  bool masked = t != 0;
14845
+ ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
14846
  } break;
14847
  case GGML_OP_WIN_PART:
14848
  {
14849
+ ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor);
14850
  } break;
14851
  case GGML_OP_WIN_UNPART:
14852
  {
14853
+ ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor);
14854
  } break;
14855
  case GGML_OP_MAP_UNARY:
14856
  {
14857
+ const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data);
14858
+ ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
14859
  }
14860
  break;
14861
  case GGML_OP_MAP_BINARY:
14862
  {
14863
+ const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data);
14864
+ ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
14865
  }
14866
  break;
14867
  case GGML_OP_MAP_CUSTOM1:
14868
  {
14869
+ const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data);
14870
+ ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
14871
  }
14872
  break;
14873
  case GGML_OP_MAP_CUSTOM2:
14874
  {
14875
+ const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data);
14876
+ ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
14877
  }
14878
  break;
14879
  case GGML_OP_MAP_CUSTOM3:
14880
  {
14881
+ const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data);
14882
+ ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun);
14883
  }
14884
  break;
14885
  case GGML_OP_CROSS_ENTROPY_LOSS:
14886
  {
14887
+ ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor);
14888
  }
14889
  break;
14890
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
14891
  {
14892
+ ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
14893
  }
14894
  break;
14895
  case GGML_OP_NONE:
 
14906
  ////////////////////////////////////////////////////////////////////////////////
14907
 
14908
  static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
14909
+ struct ggml_tensor * src0 = tensor->src[0];
14910
+ struct ggml_tensor * src1 = tensor->src[1];
14911
 
14912
  switch (tensor->op) {
14913
  case GGML_OP_DUP:
 
14943
  src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
14944
  }
14945
  if (src1->grad) {
14946
+ GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
14947
+ GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
14948
+ const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
14949
+ const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
14950
+ const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
14951
+ const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
14952
 
14953
  struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
14954
  tensor->grad,
 
15256
  } break;
15257
  case GGML_OP_SET:
15258
  {
15259
+ GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5);
15260
+ GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32);
15261
+ const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0];
15262
+ const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1];
15263
+ const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2];
15264
+ const size_t offset = (( int32_t * ) tensor->src[2]->data)[3];
15265
 
15266
  struct ggml_tensor * tensor_grad_view = NULL;
15267
 
 
15338
  if (src0->grad) {
15339
  size_t offset;
15340
 
15341
+ GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2]));
15342
+ memcpy(&offset, tensor->src[2]->data, sizeof(offset));
15343
 
15344
  size_t nb1 = tensor->nb[1];
15345
  size_t nb2 = tensor->nb[2];
 
15366
  {
15367
  // necessary for llama
15368
  if (src0->grad) {
15369
+ int32_t * axes = (int32_t *) tensor->src[2]->data;
15370
  int axis0 = axes[0] & 0x3;
15371
  int axis1 = axes[1] & 0x3;
15372
  int axis2 = axes[2] & 0x3;
 
15529
  case GGML_OP_FLASH_ATTN:
15530
  {
15531
  struct ggml_tensor * flash_grad = NULL;
15532
+ if (src0->grad || src1->grad || tensor->src[2]->grad) {
15533
+ int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
15534
  GGML_ASSERT(t == 0 || t == 1);
15535
  bool masked = t != 0;
15536
  flash_grad =
15537
  ggml_flash_attn_back(ctx,
15538
  src0,
15539
  src1,
15540
+ tensor->src[2],
15541
  tensor->grad,
15542
  masked);
15543
  }
 
15634
  inplace);
15635
  }
15636
 
15637
+ struct ggml_tensor * opt0 = tensor->src[2];
15638
 
15639
  if (opt0->grad) {
15640
  struct ggml_tensor * grad_v = NULL;
 
15750
  }
15751
  }
15752
 
15753
+ for (int i = 0; i < GGML_MAX_SRC; ++i) {
15754
+ if (node->src[i]) {
15755
+ ggml_visit_parents(cgraph, node->src[i]);
 
 
 
 
 
 
 
 
15756
  }
15757
  }
15758
 
 
15807
  struct ggml_cgraph result = {
15808
  /*.n_nodes =*/ 0,
15809
  /*.n_leafs =*/ 0,
 
 
 
15810
  /*.nodes =*/ { NULL },
15811
  /*.grads =*/ { NULL },
15812
  /*.leafs =*/ { NULL },
 
15977
  #endif
15978
 
15979
  struct ggml_compute_state_shared {
15980
+ const struct ggml_cgraph * cgraph;
15981
+ const struct ggml_cplan * cplan;
15982
 
15983
  int64_t perf_node_start_cycles;
15984
  int64_t perf_node_start_time_us;
15985
 
15986
+ const int n_threads;
15987
 
15988
  // synchronization primitives
15989
  atomic_int n_active; // num active threads
 
16007
 
16008
  static thread_ret_t ggml_graph_compute_thread(void * data) {
16009
  struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
16010
 
16011
+ const struct ggml_cgraph * cgraph = state->shared->cgraph;
16012
+ const struct ggml_cplan * cplan = state->shared->cplan;
16013
+
16014
+ const int * n_tasks_arr = cplan->n_tasks;
16015
+ const int n_threads = state->shared->n_threads;
16016
+
16017
  set_numa_thread_affinity(state->ith, n_threads);
16018
 
16019
  int node_n = -1;
 
16026
  /*.type =*/ GGML_TASK_FINALIZE,
16027
  /*.ith =*/ 0,
16028
  /*.nth =*/ 0,
16029
+ /*.wsize =*/ cplan->work_size,
16030
+ /*.wdata =*/ cplan->work_data,
16031
  };
16032
 
16033
  if (node_n != -1) {
16034
  /* FINALIZE */
16035
  struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
16036
  if (GGML_OP_HAS_FINALIZE[node->op]) {
16037
+ params.nth = n_tasks_arr[node_n];
16038
  ggml_compute_forward(&params, node);
16039
  ggml_graph_compute_perf_stats_node(node, state->shared);
16040
  }
 
16045
  GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16046
 
16047
  struct ggml_tensor * node = cgraph->nodes[node_n];
16048
+ const int n_tasks = n_tasks_arr[node_n];
16049
 
16050
  state->shared->perf_node_start_cycles = ggml_perf_cycles();
16051
  state->shared->perf_node_start_time_us = ggml_perf_time_us();
16052
 
16053
+ params.nth = n_tasks;
16054
 
16055
  /* INIT */
16056
  if (GGML_OP_HAS_INIT[node->op]) {
 
16058
  ggml_compute_forward(&params, node);
16059
  }
16060
 
16061
+ if (n_tasks == 1) {
16062
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
16063
  // they do something more efficient than spinning (?)
16064
  params.type = GGML_TASK_COMPUTE;
 
16080
  // wait for other threads to finish
16081
  const int last = node_n;
16082
  do {
16083
+ //sched_yield();
16084
  node_n = atomic_load(&state->shared->node_n);
16085
  } while (node_n == last);
16086
  }
 
16090
 
16091
  /* COMPUTE */
16092
  struct ggml_tensor * node = cgraph->nodes[node_n];
16093
+ const int n_tasks = n_tasks_arr[node_n];
16094
 
16095
  struct ggml_compute_params params = {
16096
  /*.type =*/ GGML_TASK_COMPUTE,
16097
  /*.ith =*/ state->ith,
16098
+ /*.nth =*/ n_tasks,
16099
+ /*.wsize =*/ cplan->work_size,
16100
+ /*.wdata =*/ cplan->work_data,
16101
  };
16102
 
16103
+ if (state->ith < n_tasks) {
16104
  ggml_compute_forward(&params, node);
16105
  }
16106
  }
 
16108
  return 0;
16109
  }
16110
 
16111
+ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16112
+ if (n_threads <= 0) {
16113
+ n_threads = GGML_DEFAULT_N_THREADS;
16114
+ }
16115
 
16116
+ size_t work_size = 0;
 
 
 
 
 
 
 
 
16117
 
16118
+ struct ggml_cplan cplan;
16119
+ memset(&cplan, 0, sizeof(struct ggml_cplan));
 
16120
 
16121
+ // thread scheduling for the different operations + work buffer size estimation
16122
+ for (int i = 0; i < cgraph->n_nodes; i++) {
16123
+ int n_tasks = 1;
16124
 
16125
+ struct ggml_tensor * node = cgraph->nodes[i];
 
 
 
 
16126
 
16127
+ switch (node->op) {
16128
+ case GGML_OP_CPY:
16129
+ case GGML_OP_DUP:
16130
+ {
16131
+ n_tasks = n_threads;
16132
 
16133
+ size_t cur = 0;
16134
+ if (ggml_is_quantized(node->type)) {
16135
+ cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
16136
+ }
 
 
16137
 
16138
+ work_size = MAX(work_size, cur);
16139
+ } break;
16140
+ case GGML_OP_ADD:
16141
+ case GGML_OP_ADD1:
16142
+ {
16143
+ n_tasks = n_threads;
16144
 
16145
+ size_t cur = 0;
 
 
16146
 
16147
+ if (ggml_is_quantized(node->src[0]->type)) {
16148
+ cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
16149
+ }
 
 
16150
 
16151
+ work_size = MAX(work_size, cur);
16152
+ } break;
16153
+ case GGML_OP_ACC:
16154
+ {
16155
+ n_tasks = n_threads;
16156
 
16157
+ size_t cur = 0;
 
 
16158
 
16159
+ if (ggml_is_quantized(node->src[0]->type)) {
16160
+ cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
16161
+ }
16162
+
16163
+ work_size = MAX(work_size, cur);
16164
+ } break;
16165
+ case GGML_OP_SUB:
16166
+ case GGML_OP_DIV:
16167
+ case GGML_OP_SQR:
16168
+ case GGML_OP_SQRT:
16169
+ case GGML_OP_LOG:
16170
+ case GGML_OP_SUM:
16171
+ case GGML_OP_SUM_ROWS:
16172
+ case GGML_OP_MEAN:
16173
+ case GGML_OP_ARGMAX:
16174
+ case GGML_OP_REPEAT:
16175
+ case GGML_OP_REPEAT_BACK:
16176
+ case GGML_OP_ABS:
16177
+ case GGML_OP_SGN:
16178
+ case GGML_OP_NEG:
16179
+ case GGML_OP_STEP:
16180
+ case GGML_OP_TANH:
16181
+ case GGML_OP_ELU:
16182
+ case GGML_OP_RELU:
16183
+ {
16184
+ n_tasks = 1;
16185
+ } break;
16186
+ case GGML_OP_MUL:
16187
+ case GGML_OP_GELU:
16188
+ case GGML_OP_GELU_QUICK:
16189
+ case GGML_OP_SILU:
16190
+ case GGML_OP_SILU_BACK:
16191
+ case GGML_OP_NORM:
16192
+ case GGML_OP_RMS_NORM:
16193
+ case GGML_OP_RMS_NORM_BACK:
16194
+ {
16195
+ n_tasks = n_threads;
16196
+ } break;
16197
+ case GGML_OP_MUL_MAT:
16198
+ case GGML_OP_OUT_PROD:
16199
+ {
16200
+ n_tasks = n_threads;
16201
+
16202
+ // TODO: use different scheduling for different matrix sizes
16203
+ //const int nr0 = ggml_nrows(node->src[0]);
16204
+ //const int nr1 = ggml_nrows(node->src[1]);
16205
+
16206
+ //n_tasks = MIN(n_threads, MAX(1, nr0/128));
16207
+ //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
16208
+
16209
+ size_t cur = 0;
16210
+ const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
16211
 
16212
  #if defined(GGML_USE_CUBLAS)
16213
+ if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
16214
+ n_tasks = 1; // TODO: this actually is doing nothing
16215
+ // the threads are still spinning
16216
+ } else
 
16217
  #elif defined(GGML_USE_CLBLAST)
16218
+ if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
16219
+ n_tasks = 1; // TODO: this actually is doing nothing
16220
+ // the threads are still spinning
16221
+ cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
16222
+ } else
 
16223
  #endif
16224
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16225
+ if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
16226
+ n_tasks = 1; // TODO: this actually is doing nothing
16227
+ // the threads are still spinning
16228
+ if (node->src[0]->type != GGML_TYPE_F32) {
16229
+ // here we need memory just for single 2D matrix from src0
16230
+ cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
 
 
 
 
 
 
 
16231
  }
16232
+ } else
16233
+ #endif
16234
+ if (node->src[1]->type != vec_dot_type) {
16235
+ cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
16236
+ } else {
16237
+ cur = 0;
16238
+ }
16239
 
16240
+ work_size = MAX(work_size, cur);
16241
+ } break;
16242
+ case GGML_OP_SCALE:
16243
+ {
16244
+ n_tasks = 1;
16245
+ } break;
16246
+ case GGML_OP_SET:
16247
+ case GGML_OP_CONT:
16248
+ case GGML_OP_RESHAPE:
16249
+ case GGML_OP_VIEW:
16250
+ case GGML_OP_PERMUTE:
16251
+ case GGML_OP_TRANSPOSE:
16252
+ case GGML_OP_GET_ROWS:
16253
+ case GGML_OP_GET_ROWS_BACK:
16254
+ case GGML_OP_DIAG:
16255
+ case GGML_OP_DIAG_MASK_ZERO:
16256
+ {
16257
+ n_tasks = 1;
16258
+ } break;
16259
+ case GGML_OP_DIAG_MASK_INF:
16260
+ case GGML_OP_SOFT_MAX:
16261
+ case GGML_OP_SOFT_MAX_BACK:
16262
+ case GGML_OP_ROPE:
16263
+ case GGML_OP_ROPE_BACK:
16264
+ {
16265
+ n_tasks = n_threads;
16266
+ } break;
16267
+ case GGML_OP_ALIBI:
16268
+ {
16269
+ n_tasks = 1; //TODO
16270
+ } break;
16271
+ case GGML_OP_CLAMP:
16272
+ {
16273
+ n_tasks = 1; //TODO
16274
+ } break;
16275
+ case GGML_OP_CONV_1D:
16276
+ {
16277
+ n_tasks = n_threads;
16278
+
16279
+ GGML_ASSERT(node->src[0]->ne[3] == 1);
16280
+ GGML_ASSERT(node->src[1]->ne[2] == 1);
16281
+ GGML_ASSERT(node->src[1]->ne[3] == 1);
16282
+
16283
+ size_t cur = 0;
16284
+ const int nk = node->src[0]->ne[0];
16285
+
16286
+ if (node->src[0]->type == GGML_TYPE_F16 &&
16287
+ node->src[1]->type == GGML_TYPE_F32) {
16288
+ cur = sizeof(ggml_fp16_t)*(
16289
+ nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] +
16290
+ ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1]
16291
+ );
16292
+ } else if (node->src[0]->type == GGML_TYPE_F32 &&
16293
+ node->src[1]->type == GGML_TYPE_F32) {
16294
+ cur = sizeof(float)*(
16295
+ nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] +
16296
+ ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1]
16297
+ );
16298
+ } else {
16299
+ GGML_ASSERT(false);
16300
+ }
16301
 
16302
+ work_size = MAX(work_size, cur);
16303
+ } break;
16304
+ case GGML_OP_CONV_2D:
16305
+ {
16306
+ n_tasks = n_threads;
16307
 
16308
+ GGML_ASSERT(node->src[1]->ne[3] == 1);
16309
 
16310
+ const int64_t ne00 = node->src[0]->ne[0]; // W
16311
+ const int64_t ne01 = node->src[0]->ne[1]; // H
16312
+ const int64_t ne02 = node->src[0]->ne[2]; // C
16313
+ const int64_t ne03 = node->src[0]->ne[3]; // N
16314
 
16315
+ const int64_t ne10 = node->src[1]->ne[0]; // W
16316
+ const int64_t ne11 = node->src[1]->ne[1]; // H
16317
+ const int64_t ne12 = node->src[1]->ne[2]; // C
16318
 
16319
+ const int64_t nk = ne00*ne01;
16320
 
16321
+ UNUSED(ne02);
16322
+ UNUSED(ne03);
16323
+ UNUSED(nk);
16324
 
16325
+ size_t cur = 0;
16326
 
16327
+ if (node->src[0]->type == GGML_TYPE_F16 &&
16328
+ node->src[1]->type == GGML_TYPE_F32) {
16329
+ cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
16330
+ } else if (node->src[0]->type == GGML_TYPE_F32 &&
16331
+ node->src[1]->type == GGML_TYPE_F32) {
16332
+ cur = sizeof(float)* (ne10*ne11*ne12);
16333
+ } else {
16334
+ GGML_ASSERT(false);
16335
+ }
16336
 
16337
+ work_size = MAX(work_size, cur);
16338
+ } break;
16339
+ case GGML_OP_FLASH_ATTN:
16340
+ {
16341
+ n_tasks = n_threads;
16342
 
16343
+ size_t cur = 0;
16344
 
16345
+ const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16346
 
16347
+ if (node->src[1]->type == GGML_TYPE_F32) {
16348
+ cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16349
+ cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16350
+ }
16351
 
16352
+ if (node->src[1]->type == GGML_TYPE_F16) {
16353
+ cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16354
+ cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16355
+ }
16356
 
16357
+ work_size = MAX(work_size, cur);
16358
+ } break;
16359
+ case GGML_OP_FLASH_FF:
16360
+ {
16361
+ n_tasks = n_threads;
16362
 
16363
+ size_t cur = 0;
16364
 
16365
+ if (node->src[1]->type == GGML_TYPE_F32) {
16366
+ cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16367
+ cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16368
+ }
16369
 
16370
+ if (node->src[1]->type == GGML_TYPE_F16) {
16371
+ cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16372
+ cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16373
+ }
16374
 
16375
+ work_size = MAX(work_size, cur);
16376
+ } break;
16377
+ case GGML_OP_FLASH_ATTN_BACK:
16378
+ {
16379
+ n_tasks = n_threads;
16380
 
16381
+ size_t cur = 0;
16382
 
16383
+ const int64_t D = node->src[0]->ne[0];
16384
+ const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16385
+ const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
16386
+ if (node->src[1]->type == GGML_TYPE_F32) {
16387
+ cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16388
+ cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16389
+ }
16390
 
16391
+ if (node->src[1]->type == GGML_TYPE_F16) {
16392
+ cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16393
+ cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16394
+ }
16395
 
16396
+ work_size = MAX(work_size, cur);
16397
+ } break;
16398
+ case GGML_OP_WIN_PART:
16399
+ case GGML_OP_WIN_UNPART:
16400
+ case GGML_OP_MAP_UNARY:
16401
+ case GGML_OP_MAP_BINARY:
16402
+ case GGML_OP_MAP_CUSTOM1:
16403
+ case GGML_OP_MAP_CUSTOM2:
16404
+ case GGML_OP_MAP_CUSTOM3:
16405
+ {
16406
+ n_tasks = 1;
16407
+ } break;
16408
+ case GGML_OP_CROSS_ENTROPY_LOSS:
16409
+ {
16410
+ n_tasks = n_threads;
16411
+
16412
+ size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16413
+
16414
+ work_size = MAX(work_size, cur);
16415
+ } break;
16416
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
16417
+ {
16418
+ n_tasks = n_threads;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16419
 
16420
+ size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
16421
+
16422
+ work_size = MAX(work_size, cur);
16423
+ } break;
16424
+ case GGML_OP_NONE:
16425
+ {
16426
+ n_tasks = 1;
16427
+ } break;
16428
+ case GGML_OP_COUNT:
16429
+ {
16430
+ GGML_ASSERT(false);
16431
+ } break;
16432
  }
16433
 
16434
+ cplan.n_tasks[i] = n_tasks;
16435
+ }
16436
 
16437
+ if (work_size > 0) {
16438
+ work_size += CACHE_LINE_SIZE*(n_threads - 1);
16439
+ }
16440
+
16441
+ cplan.n_threads = n_threads;
16442
+ cplan.work_size = work_size;
16443
+ cplan.work_data = NULL;
16444
+
16445
+ return cplan;
16446
+ }
16447
+
16448
+ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16449
+ {
16450
+ GGML_ASSERT(cplan);
16451
+ GGML_ASSERT(cplan->n_threads > 0);
16452
+
16453
+ if (cplan->work_size > 0) {
16454
+ GGML_ASSERT(cplan->work_data);
16455
+ }
16456
+
16457
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
16458
+ if (cgraph->nodes[i]->op != GGML_OP_NONE) {
16459
+ GGML_ASSERT(cplan->n_tasks[i] > 0);
16460
+ }
16461
  }
16462
  }
16463
 
16464
+ const int n_threads = cplan->n_threads;
16465
+
16466
+ struct ggml_compute_state_shared state_shared = {
16467
+ /*.cgraph =*/ cgraph,
16468
+ /*.cgraph_plan =*/ cplan,
16469
+ /*.perf_node_start_cycles =*/ 0,
16470
+ /*.perf_node_start_time_us =*/ 0,
16471
+ /*.n_threads =*/ n_threads,
16472
+ /*.n_active =*/ n_threads,
16473
+ /*.node_n =*/ -1,
16474
+ };
16475
+ struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
16476
+
16477
  // create thread pool
16478
  if (n_threads > 1) {
16479
  for (int j = 1; j < n_threads; ++j) {
 
16535
  }
16536
  }
16537
 
16538
+ void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16539
+ struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16540
+
16541
+ struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
16542
+ GGML_ASSERT(buf);
16543
+
16544
+ cplan.work_data = buf->data;
16545
+
16546
+ ggml_graph_compute(cgraph, &cplan);
16547
+ }
16548
+
16549
  struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
16550
  for (int i = 0; i < cgraph->n_leafs; i++) {
16551
  struct ggml_tensor * leaf = cgraph->leafs[i];
 
16584
  const int64_t * ne = tensor->ne;
16585
  const size_t * nb = tensor->nb;
16586
 
16587
+ fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
16588
  arg,
16589
  ggml_type_name(tensor->type),
16590
  ggml_op_name (tensor->op),
16591
  tensor->n_dims,
16592
  ne[0], ne[1], ne[2], ne[3],
16593
  nb[0], nb[1], nb[2], nb[3],
 
16594
  tensor->data,
16595
  tensor->name);
16596
  }
 
16627
  ggml_graph_export_leaf(cgraph->leafs[i], fout);
16628
 
16629
  GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
16630
+ GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL);
16631
+ GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL);
16632
  }
16633
 
16634
  // header
 
16639
  for (int i = 0; i < cgraph->n_nodes; ++i) {
16640
  ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
16641
 
16642
+ for (int j = 0; j < GGML_MAX_SRC; ++j) {
16643
+ if (cgraph->nodes[i]->src[j]) {
16644
+ ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout);
 
 
 
 
 
 
 
 
16645
  }
16646
  }
16647
 
 
16732
 
16733
  // output the op arguments
16734
  {
16735
+ struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };
16736
 
16737
+ for (int j = 0; j < GGML_MAX_SRC; ++j) {
16738
+ args[j] = tensor->src[j];
 
 
 
16739
  }
16740
 
16741
+ for (int j = 0; j < GGML_MAX_SRC; ++j) {
16742
  if (args[j]) {
16743
  int32_t idx = -1;
16744
 
 
16956
 
16957
  const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
16958
 
16959
+ const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);
16960
 
16961
+ struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };
16962
 
16963
  // parse args
16964
+ for (int j = 0; j < GGML_MAX_SRC; ++j) {
16965
  const int32_t arg_idx = ptr_arg_idx[j];
16966
 
16967
  if (arg_idx == -1) {
 
17018
  tensor->nb[j] = nb[j];
17019
  }
17020
 
17021
+ for (int j = 0; j < GGML_MAX_SRC; ++j) {
17022
+ tensor->src[j] = args[j];
 
 
 
17023
  }
17024
 
17025
  result.nodes[i] = tensor;
 
17218
  for (int i = 0; i < gb->n_nodes; i++) {
17219
  struct ggml_tensor * node = gb->nodes[i];
17220
 
17221
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
17222
+ if (node->src[j]) {
 
 
 
 
 
 
 
 
17223
  char label[16];
17224
+ snprintf(label, sizeof(label), "src %d", j);
17225
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label);
17226
  }
17227
  }
17228
  }
 
17230
  for (int i = 0; i < gb->n_leafs; i++) {
17231
  struct ggml_tensor * node = gb->leafs[i];
17232
 
17233
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
17234
+ if (node->src[j]) {
 
 
 
 
 
 
 
 
17235
  char label[16];
17236
+ snprintf(label, sizeof(label), "src %d", j);
17237
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label);
17238
  }
17239
  }
17240
  }
 
17296
  struct ggml_cgraph * gb) {
17297
  GGML_ASSERT(ggml_is_scalar(f));
17298
 
 
 
 
17299
  // these will store the parameters we want to optimize
17300
  struct ggml_tensor * ps[GGML_MAX_PARAMS];
17301
 
 
17342
  // compute the function value
17343
  ggml_graph_reset (gf);
17344
  ggml_set_f32 (f->grad, 1.0f);
17345
+
17346
+ ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
17347
 
17348
  opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
17349
  opt->adam.fx_best = opt->adam.fx_prev;
 
17423
 
17424
  ggml_graph_reset (gf);
17425
  ggml_set_f32 (f->grad, 1.0f);
17426
+
17427
+ ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
17428
 
17429
  const float fx = ggml_get_f32_1d(f, 0);
17430
 
 
17546
 
17547
  ggml_graph_reset (gf);
17548
  ggml_set_f32 (f->grad, 1.0f);
17549
+
17550
+ ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);
17551
 
17552
  ggml_opt_get_grad(np, ps, g);
17553
 
 
17615
  }
17616
  }
17617
 
 
 
 
17618
  const int m = params.lbfgs.m;
17619
 
17620
  // these will store the parameters we want to optimize
 
17666
 
17667
  ggml_graph_reset (gf);
17668
  ggml_set_f32 (f->grad, 1.0f);
17669
+
17670
+ ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
17671
 
17672
  ggml_opt_get_grad(np, ps, g);
17673
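Most of ggml_graph_plan above is spent estimating the scratch buffer each operator needs. A hedged worked example for the GGML_OP_MUL_MAT branch, with illustrative sizes (the real values come from GGML_TYPE_SIZE, GGML_BLCK_SIZE and the actual tensor shapes; the example function is not part of the commit):

#include <stddef.h>

// Hedged worked example of the GGML_OP_MUL_MAT sizing in ggml_graph_plan above,
// assuming a Q4_0-like vec_dot_type (18 bytes per 32-value block) and a 4096 x 512 f32 src1.
// The 64-byte cache line mirrors the CACHE_LINE_SIZE padding applied after the node loop.
static size_t example_mul_mat_work_size(int n_threads) {
    const size_t type_size  = 18;          // GGML_TYPE_SIZE[vec_dot_type]  (assumed value)
    const size_t block_size = 32;          // GGML_BLCK_SIZE[vec_dot_type]  (assumed value)
    const size_t nelements  = 4096 * 512;  // ggml_nelements(node->src[1])  (assumed shape)
    const size_t cache_line = 64;          // CACHE_LINE_SIZE               (assumed value)

    size_t work_size = type_size * nelements / block_size;  // 1,179,648 bytes for this node
    if (work_size > 0) {
        work_size += cache_line * (size_t)(n_threads - 1);   // keep per-thread scratch apart
    }
    return work_size;
}

The per-node maximum, not the sum, is kept (work_size = MAX(work_size, cur)), since nodes execute one at a time and share the same work buffer.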
 
ggml.h CHANGED
@@ -65,7 +65,7 @@
65
  // ggml_set_f32(a, 3.0f);
66
  // ggml_set_f32(b, 4.0f);
67
  //
68
- // ggml_graph_compute(ctx0, &gf);
69
  //
70
  // printf("f = %f\n", ggml_get_f32_1d(f, 0));
71
  //
@@ -132,10 +132,10 @@
132
  // {
133
  // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
134
  //
135
- // // a[1, 2] = 1.0f;
136
  // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
137
  //
138
- // // a[2, 0] = 2.0f;
139
  // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
140
  //
141
  // ...
@@ -197,12 +197,18 @@
197
  #define GGML_MAX_NODES 4096
198
  #define GGML_MAX_PARAMS 256
199
  #define GGML_MAX_CONTEXTS 64
200
- #define GGML_MAX_OPT 4
201
  #define GGML_MAX_NAME 48
202
  #define GGML_DEFAULT_N_THREADS 4
203
 
204
  #define GGML_UNUSED(x) (void)(x)
205
 
 
 
 
 
 
 
206
  #define GGML_ASSERT(x) \
207
  do { \
208
  if (!(x)) { \
@@ -414,12 +420,7 @@ extern "C" {
414
  bool is_param;
415
 
416
  struct ggml_tensor * grad;
417
- struct ggml_tensor * src0;
418
- struct ggml_tensor * src1;
419
- struct ggml_tensor * opt[GGML_MAX_OPT];
420
-
421
- // thread scheduling
422
- int n_tasks;
423
 
424
  // performance
425
  int perf_runs;
@@ -432,19 +433,27 @@ extern "C" {
432
 
433
  void * extra; // extra things e.g. for ggml-cuda.cu
434
 
435
- char padding[4];
436
  };
437
 
438
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
439
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  // computation graph
441
  struct ggml_cgraph {
442
  int n_nodes;
443
  int n_leafs;
444
- int n_threads;
445
-
446
- size_t work_size;
447
- struct ggml_tensor * work;
448
 
449
  struct ggml_tensor * nodes[GGML_MAX_NODES];
450
  struct ggml_tensor * grads[GGML_MAX_NODES];
@@ -532,6 +541,8 @@ extern "C" {
532
  // use this to compute the memory overhead of a tensor
533
  GGML_API size_t ggml_tensor_overhead(void);
534
 
 
 
535
  GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
536
 
537
  // main
@@ -1292,15 +1303,22 @@ extern "C" {
1292
 
1293
  GGML_API void ggml_set_param(
1294
  struct ggml_context * ctx,
1295
- struct ggml_tensor * tensor);
1296
 
1297
  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1298
 
1299
  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
1300
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
1301
 
1302
- GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
1303
- GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
 
 
 
 
 
 
 
1304
 
1305
  GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
1306
 
 
65
  // ggml_set_f32(a, 3.0f);
66
  // ggml_set_f32(b, 4.0f);
67
  //
68
+ // ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
69
  //
70
  // printf("f = %f\n", ggml_get_f32_1d(f, 0));
71
  //
 
132
  // {
133
  // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
134
  //
135
+ // // a[2, 1] = 1.0f;
136
  // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
137
  //
138
+ // // a[0, 2] = 2.0f;
139
  // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
140
  //
141
  // ...
 
197
  #define GGML_MAX_NODES 4096
198
  #define GGML_MAX_PARAMS 256
199
  #define GGML_MAX_CONTEXTS 64
200
+ #define GGML_MAX_SRC 6
201
  #define GGML_MAX_NAME 48
202
  #define GGML_DEFAULT_N_THREADS 4
203
 
204
  #define GGML_UNUSED(x) (void)(x)
205
 
206
+ // Maximum training context of the model in use
207
+ // For the LLaMA models this is normally 2048, but somehow "stepping out" by 128 gives better results (tested at 7B and 13B)
208
+ #ifndef GGML_TRAINING_CTX
209
+ #define GGML_TRAINING_CTX 2048
210
+ #endif
211
+
212
  #define GGML_ASSERT(x) \
213
  do { \
214
  if (!(x)) { \
 
420
  bool is_param;
421
 
422
  struct ggml_tensor * grad;
423
+ struct ggml_tensor * src[GGML_MAX_SRC];
 
 
 
 
 
424
 
425
  // performance
426
  int perf_runs;
 
433
 
434
  void * extra; // extra things e.g. for ggml-cuda.cu
435
 
436
+ char padding[8];
437
  };
438
 
439
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
440
 
441
+ // the compute plan that needs to be prepared for ggml_graph_compute()
442
+ // since https://github.com/ggerganov/ggml/issues/287
443
+ struct ggml_cplan {
444
+ size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
445
+ uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
446
+
447
+ int n_threads;
448
+
449
+ // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
450
+ int n_tasks[GGML_MAX_NODES];
451
+ };
452
+
453
  // computation graph
454
  struct ggml_cgraph {
455
  int n_nodes;
456
  int n_leafs;
 
 
 
 
457
 
458
  struct ggml_tensor * nodes[GGML_MAX_NODES];
459
  struct ggml_tensor * grads[GGML_MAX_NODES];
 
541
  // use this to compute the memory overhead of a tensor
542
  GGML_API size_t ggml_tensor_overhead(void);
543
 
544
+ GGML_API void set_ntk_rope_scale_mode(bool useNtk);
545
+ GGML_API bool get_ntk_rope_scale_mode();
546
  GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
547
 
548
  // main
 
1303
 
1304
  GGML_API void ggml_set_param(
1305
  struct ggml_context * ctx,
1306
+ struct ggml_tensor * tensor);
1307
 
1308
  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1309
 
1310
  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
1311
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
1312
 
1313
+ // ggml_graph_plan() has to be called before ggml_graph_compute()
1314
+ // when plan.work_size > 0, caller must allocate memory for plan.work_data
1315
+ GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1316
+ GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1317
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
1318
+
1319
+ // same as ggml_graph_compute() but the work data is allocated as a part of the context
1320
+ // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
1321
+ GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
1322
 
1323
  GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
1324
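Taken together, the declarations above replace the old single-call compute with a plan step plus a compute step. A minimal sketch of the new flow, assuming ctx and f were built as in the earlier header example (error handling omitted; compute_graph is a hypothetical helper):

#include <stdlib.h>
#include "ggml.h"

// Sketch: computing a graph for tensor f with the new two-step API.
static void compute_graph(struct ggml_context * ctx, struct ggml_tensor * f, int n_threads) {
    struct ggml_cgraph gf = ggml_build_forward(f);

    struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads);
    if (plan.work_size > 0) {
        plan.work_data = malloc(plan.work_size);  // caller owns the work buffer
    }

    ggml_graph_compute(&gf, &plan);
    free(plan.work_data);

    // alternatively, let the context own the buffer:
    // ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
    (void) ctx;  // only needed for the _with_ctx variant
}

ggml_graph_compute_with_ctx covers the common case by allocating the work buffer as a GGML_TYPE_I8 tensor inside the context, so the context needs roughly work_size plus ggml_tensor_overhead() bytes of headroom.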
 
gpttype_adapter.cpp CHANGED
@@ -33,6 +33,8 @@ std::string executable_path = "";
33
  std::string lora_filename = "";
34
  std::string lora_base = "";
35
  bool generation_finished;
 
 
36
  std::vector<std::string> generated_tokens;
37
 
38
  //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
@@ -346,6 +348,13 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
346
  = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
347
  = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
348
 
 
 
 
 
 
 
 
349
  //handle custom token bans
350
  banned_tokens.clear();
351
  for(int x=0;x<ban_token_max;++x)
@@ -563,7 +572,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
563
  rwkv_ctx_v3->logits_out = (float *)malloc(logitbufsiz);
564
  rwkv_ctx_v3->state_in = nullptr;
565
 
566
- bool testeval = rwkv_eval(rwkv_ctx_v3, 0, rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out);
567
  if (!testeval)
568
  {
569
  printf("\nError: RWKV Init Eval Failed!\n");
@@ -832,6 +841,7 @@ const std::string & gpttype_get_pending_output()
832
 
833
  generation_outputs gpttype_generate(const generation_inputs inputs, generation_outputs &output)
834
  {
 
835
  stop_sequence.clear();
836
  for(int x=0;x<stop_token_max;++x)
837
  {
@@ -964,7 +974,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
964
  stopper_unused_tokens = 0;
965
  int input_consumed = 0;
966
  std::mt19937 rng(params.seed);
967
- concat_output = "";
968
 
969
  //prepare sampler order
970
  std::vector<samplers> sampler_order;
@@ -1162,12 +1171,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1162
  {
1163
  if(embd.size()>1)
1164
  {
1165
- evalres = rwkv_eval_sequence(rwkv_ctx_v3, (uint32_t*)embd.data(), embd.size(), rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out);
1166
  }
1167
  else
1168
  {
1169
  bool ignoreLogits = (!startedsampling && ((int)embd_inp.size() > input_consumed + 2));
1170
- evalres = rwkv_eval(rwkv_ctx_v3, embd[0], rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, ignoreLogits?nullptr:rwkv_ctx_v3->logits_out);
1171
  }
1172
 
1173
  memcpy(logits.data(), rwkv_ctx_v3->logits_out, sizeof(float) * rwkv_vocab.size());
@@ -1438,6 +1447,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1438
  fflush(stdout);
1439
  output.status = 1;
1440
  generation_finished = true;
 
 
1441
  snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());
1442
 
1443
  return output;
 
33
  std::string lora_filename = "";
34
  std::string lora_base = "";
35
  bool generation_finished;
36
+ float last_process_time = 0;
37
+ float last_eval_time = 0;
38
  std::vector<std::string> generated_tokens;
39
 
40
  //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
 
348
  = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
349
  = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
350
 
351
+ //handle linear rope
352
+ if(inputs.linear_rope)
353
+ {
354
+ printf("Using Linear RoPE scaling instead of NTK-Aware scaling.\n");
355
+ }
356
+ set_ntk_rope_scale_mode(!inputs.linear_rope);
357
+
358
  //handle custom token bans
359
  banned_tokens.clear();
360
  for(int x=0;x<ban_token_max;++x)
 
572
  rwkv_ctx_v3->logits_out = (float *)malloc(logitbufsiz);
573
  rwkv_ctx_v3->state_in = nullptr;
574
 
575
+ bool testeval = rwkv_eval(rwkv_ctx_v3, params.n_threads, 0, rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out);
576
  if (!testeval)
577
  {
578
  printf("\nError: RWKV Init Eval Failed!\n");
 
841
 
842
  generation_outputs gpttype_generate(const generation_inputs inputs, generation_outputs &output)
843
  {
844
+ concat_output = "";
845
  stop_sequence.clear();
846
  for(int x=0;x<stop_token_max;++x)
847
  {
 
974
  stopper_unused_tokens = 0;
975
  int input_consumed = 0;
976
  std::mt19937 rng(params.seed);
 
977
 
978
  //prepare sampler order
979
  std::vector<samplers> sampler_order;
 
1171
  {
1172
  if(embd.size()>1)
1173
  {
1174
+ evalres = rwkv_eval_sequence(rwkv_ctx_v3, params.n_threads, (uint32_t*)embd.data(), embd.size(), rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out);
1175
  }
1176
  else
1177
  {
1178
  bool ignoreLogits = (!startedsampling && ((int)embd_inp.size() > input_consumed + 2));
1179
+ evalres = rwkv_eval(rwkv_ctx_v3, params.n_threads, embd[0], rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, ignoreLogits?nullptr:rwkv_ctx_v3->logits_out);
1180
  }
1181
 
1182
  memcpy(logits.data(), rwkv_ctx_v3->logits_out, sizeof(float) * rwkv_vocab.size());
 
1447
  fflush(stdout);
1448
  output.status = 1;
1449
  generation_finished = true;
1450
+ last_eval_time = pt2;
1451
+ last_process_time = pt1;
1452
  snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());
1453
 
1454
  return output;
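The load path now forwards the new linear_rope flag from load_model_inputs straight into ggml before any evaluation happens. A small hedged sketch of that wiring (select_rope_mode is a hypothetical wrapper; the real code inlines this in gpttype_load_model):

#include <stdbool.h>
#include <stdio.h>
#include "ggml.h"

// Sketch: RoPE mode selection as wired up above. NTK-aware scaling remains the default;
// the flag flips ggml to plain linear scaling for contexts beyond the training length.
static void select_rope_mode(bool linear_rope) {
    if (linear_rope) {
        printf("Using Linear RoPE scaling instead of NTK-Aware scaling.\n");
    }
    set_ntk_rope_scale_mode(!linear_rope);
}

The RWKV eval calls gain an explicit thread count for a related reason: the graph no longer carries n_threads itself (the field was removed from ggml_cgraph), so callers presumably have to pass it through to the plan-based compute path.
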
klite.embd CHANGED
The diff for this file is too large to render. See raw diff
 
koboldcpp.py CHANGED
@@ -36,6 +36,7 @@ class load_model_inputs(ctypes.Structure):
36
  ("debugmode", ctypes.c_int),
37
  ("forceversion", ctypes.c_int),
38
  ("gpulayers", ctypes.c_int),
 
39
  ("banned_tokens", ctypes.c_char_p * ban_token_max)]
40
 
41
  class generation_inputs(ctypes.Structure):
@@ -160,6 +161,8 @@ def init_library():
160
  handle.new_token.argtypes = [ctypes.c_int]
161
  handle.get_stream_count.restype = ctypes.c_int
162
  handle.has_finished.restype = ctypes.c_bool
 
 
163
  handle.abort_generate.restype = ctypes.c_bool
164
  handle.get_pending_output.restype = ctypes.c_char_p
165
 
@@ -186,15 +189,18 @@ def load_model(model_filename):
186
  inputs.blasbatchsize = args.blasbatchsize
187
  inputs.forceversion = args.forceversion
188
  inputs.gpulayers = args.gpulayers
 
189
  clblastids = 0
190
  if args.useclblast:
191
  clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
192
  inputs.clblast_info = clblastids
193
  inputs.cublas_info = 0
194
- if (args.usecublas and "1" in args.usecublas):
195
- inputs.cublas_info = 1
 
 
196
  elif (args.usecublas and "2" in args.usecublas):
197
- inputs.cublas_info = 2
198
  inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
199
  inputs.debugmode = args.debugmode
200
  banned_tokens = args.bantokens
@@ -236,6 +242,8 @@ def generate(prompt,max_length=20, max_context_length=512, temperature=0.8, top_
236
  for i, sampler in enumerate(sampler_order):
237
  inputs.sampler_order[i] = sampler
238
  inputs.sampler_len = len(sampler_order)
 
 
239
  except TypeError as e:
240
  print("ERROR: sampler_order must be a list of integers: " + str(e))
241
  inputs.seed = seed
@@ -267,7 +275,7 @@ maxhordectx = 1024
267
  maxhordelen = 256
268
  modelbusy = False
269
  defaultport = 5001
270
- KcppVersion = "1.34"
271
  showdebug = True
272
 
273
  class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
@@ -447,6 +455,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
447
  elif self.path.endswith(('/api/extra/version')):
448
  response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion}).encode())
449
 
 
 
 
 
 
450
  if response_body is None:
451
  self.send_response(404)
452
  self.end_headers()
@@ -524,7 +537,6 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
524
  newprompt = fullprompt
525
 
526
  gen = asyncio.run(self.handle_request(genparams, newprompt, basic_api_flag, kai_sse_stream_flag))
527
-
528
  try:
529
  self.send_response(200)
530
  self.end_headers()
@@ -604,14 +616,13 @@ def RunServerMultiThreaded(addr, port, embedded_kailite = None):
604
 
605
  # note: customtkinter-5.2.0
606
  def show_new_gui():
607
- import customtkinter as ctk
608
  from tkinter.filedialog import askopenfilename
609
  from tkinter.filedialog import asksaveasfile
610
 
611
  # if args received, launch
612
  if len(sys.argv) != 1:
613
- root = ctk.CTk()
614
- #we dont want the useless window to be visible, but we want it in taskbar
615
  root.attributes("-alpha", 0)
616
  args.model_param = askopenfilename(title="Select ggml model .bin files")
617
  root.destroy()
@@ -621,6 +632,8 @@ def show_new_gui():
621
  sys.exit(2)
622
  return
623
 
 
 
624
  nextstate = 0 #0=exit, 1=launch, 2=oldgui
625
  windowwidth = 520
626
  windowheight = 500
@@ -762,21 +775,27 @@ def show_new_gui():
762
  quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 4, 50)
763
  quick_gpu_selector_label = makelabel(quick_tab, "GPU ID:", 3)
764
  quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly")
 
765
  quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM", lowvram_var, 5)
766
 
767
- # hides gpu options when CLBlast is not chosen
768
  def changerunmode(a,b,c):
769
  index = runopts_var.get()
770
  if index == "Use CLBlast" or index == "Use CuBLAS":
771
  gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
772
- gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
773
  quick_gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
774
- quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
 
 
 
 
 
775
  else:
776
  gpu_selector_label.grid_forget()
777
  gpu_selector_box.grid_forget()
 
778
  quick_gpu_selector_label.grid_forget()
779
  quick_gpu_selector_box.grid_forget()
 
780
 
781
  if index == "Use CuBLAS":
782
  lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
@@ -827,6 +846,7 @@ def show_new_gui():
827
  gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 4, 50)
828
  gpu_selector_label = makelabel(hardware_tab, "GPU ID:", 3)
829
  gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly")
 
830
  lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 5)
831
 
832
  # presets selector
@@ -927,24 +947,7 @@ def show_new_gui():
927
  root.destroy()
928
  pass
929
 
930
- ctk.CTkButton(tabs , text = "Launch", fg_color="#2f8d3c", command = guilaunch, width=80, height = 35 ).grid(row=1,column=1, stick="se", padx= 25, pady=5)
931
-
932
- # ctk.CTkButton(tabs , text = "Save", fg_color="#084a66", command = save_config, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 5, pady=5)
933
- # ctk.CTkButton(tabs , text = "Load", fg_color="#084a66", command = load_config, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 70, pady=5)
934
-
935
- ctk.CTkButton(tabs , text = "Old GUI", fg_color="#084a66", command = switch_old_gui, width=100, height = 35 ).grid(row=1,column=0, stick="sw", padx= 5, pady=5)
936
- # runs main loop until closed or launch clicked
937
- root.mainloop()
938
-
939
- if nextstate==0:
940
- print("Exiting by user request.")
941
- time.sleep(2)
942
- sys.exit()
943
- elif nextstate==2:
944
- time.sleep(0.1)
945
- show_old_gui()
946
- else:
947
- # processing vars
948
  args.threads = int(threads_var.get())
949
 
950
  args.usemlock = usemlock.get() == 1
@@ -957,11 +960,16 @@ def show_new_gui():
957
  args.smartcontext = smartcontext.get()==1
958
  args.unbantokens = unbantokens.get()==1
959
 
960
- gpuchoiceidx = int(gpu_choice_var.get())-1
 
 
961
  if runopts_var.get() == runopts[1]:
962
  args.useclblast = [[0,0], [1,0], [0,1]][gpuchoiceidx]
963
  if runopts_var.get() == runopts[2]:
964
- args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)]
 
 
 
965
  if gpulayers_var.get():
966
  args.gpulayers = int(gpulayers_var.get())
967
  if runopts_var.get()==runopts[3]:
@@ -972,9 +980,6 @@ def show_new_gui():
972
  args.noavx2 = True
973
  args.noblas = True
974
  args.nommap = True
975
- print("[Failsafe Mode : mmap is disabled.]")
976
-
977
-
978
 
979
  args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
980
 
@@ -992,6 +997,120 @@ def show_new_gui():
992
 
993
  args.hordeconfig = None if usehorde_var.get() == 0 else [horde_name_var.get(), horde_gen_var.get(), horde_context_var.get()]
994
 
995
  if not args.model_param:
996
  print("\nNo ggml model file was selected. Exiting.")
997
  time.sleep(2)
@@ -1312,6 +1431,7 @@ if __name__ == '__main__':
1312
  parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
1313
  parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192], default=2048)
1314
  parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512)
 
1315
  parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
1316
  parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
1317
  parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents the EOS token from being generated. This flag unbans it.", action='store_true')
@@ -1327,7 +1447,7 @@ if __name__ == '__main__':
1327
  compatgroup = parser.add_mutually_exclusive_group()
1328
  compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
1329
  compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
1330
- compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires Nvidia GPU. Select lowvram to not allocate VRAM scratch buffer. Enter a number after to select a different main GPU.", nargs='*',metavar=('[lowvram|normal] [main GPU ID]'), choices=['normal', 'lowvram', '0', '1', '2'])
1331
  parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
1332
  args = parser.parse_args()
1333
  main(args)
 
36
  ("debugmode", ctypes.c_int),
37
  ("forceversion", ctypes.c_int),
38
  ("gpulayers", ctypes.c_int),
39
+ ("linear_rope", ctypes.c_bool),
40
  ("banned_tokens", ctypes.c_char_p * ban_token_max)]
41
 
42
  class generation_inputs(ctypes.Structure):
 
161
  handle.new_token.argtypes = [ctypes.c_int]
162
  handle.get_stream_count.restype = ctypes.c_int
163
  handle.has_finished.restype = ctypes.c_bool
164
+ handle.get_last_eval_time.restype = ctypes.c_float
165
+ handle.get_last_process_time.restype = ctypes.c_float
166
  handle.abort_generate.restype = ctypes.c_bool
167
  handle.get_pending_output.restype = ctypes.c_char_p
168
 
 
189
  inputs.blasbatchsize = args.blasbatchsize
190
  inputs.forceversion = args.forceversion
191
  inputs.gpulayers = args.gpulayers
192
+ inputs.linear_rope = args.linearrope
193
  clblastids = 0
194
  if args.useclblast:
195
  clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
196
  inputs.clblast_info = clblastids
197
  inputs.cublas_info = 0
198
+ if (args.usecublas and "0" in args.usecublas):
199
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
200
+ elif (args.usecublas and "1" in args.usecublas):
201
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
202
  elif (args.usecublas and "2" in args.usecublas):
203
+ os.environ["CUDA_VISIBLE_DEVICES"] = "2"
204
  inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
205
  inputs.debugmode = args.debugmode
206
  banned_tokens = args.bantokens
 
242
  for i, sampler in enumerate(sampler_order):
243
  inputs.sampler_order[i] = sampler
244
  inputs.sampler_len = len(sampler_order)
245
+ if inputs.sampler_len>0 and (inputs.sampler_order[0]!=6 or inputs.sampler_order[inputs.sampler_len-1]!=5):
246
+ print("\n(Warning!!! Poor sampler_order detected! You will have reduced quality. Recommended values are [6,0,1,3,4,2,5])")
247
  except TypeError as e:
248
  print("ERROR: sampler_order must be a list of integers: " + str(e))
249
  inputs.seed = seed
 
275
  maxhordelen = 256
276
  modelbusy = False
277
  defaultport = 5001
278
+ KcppVersion = "1.35"
279
  showdebug = True
280
 
281
  class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
 
455
  elif self.path.endswith(('/api/extra/version')):
456
  response_body = (json.dumps({"result":"KoboldCpp","version":KcppVersion}).encode())
457
 
458
+ elif self.path.endswith(('/api/extra/perf')):
459
+ lastp = handle.get_last_process_time()
460
+ laste = handle.get_last_eval_time()
461
+ response_body = (json.dumps({"last_process":lastp,"last_eval":laste}).encode())
462
+
463
  if response_body is None:
464
  self.send_response(404)
465
  self.end_headers()
 
537
  newprompt = fullprompt
538
 
539
  gen = asyncio.run(self.handle_request(genparams, newprompt, basic_api_flag, kai_sse_stream_flag))
 
540
  try:
541
  self.send_response(200)
542
  self.end_headers()
 
616
 
617
  # note: customtkinter-5.2.0
618
  def show_new_gui():
 
619
  from tkinter.filedialog import askopenfilename
620
  from tkinter.filedialog import asksaveasfile
621
 
622
  # if args received, launch
623
  if len(sys.argv) != 1:
624
+ import tkinter as tk
625
+ root = tk.Tk() #we don't want the useless window to be visible, but we want it in the taskbar
626
  root.attributes("-alpha", 0)
627
  args.model_param = askopenfilename(title="Select ggml model .bin files")
628
  root.destroy()
 
632
  sys.exit(2)
633
  return
634
 
635
+ import customtkinter as ctk
636
+
637
  nextstate = 0 #0=exit, 1=launch, 2=oldgui
638
  windowwidth = 520
639
  windowheight = 500
 
775
  quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 4, 50)
776
  quick_gpu_selector_label = makelabel(quick_tab, "GPU ID:", 3)
777
  quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly")
778
+ CUDA_quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3","All"], width=60, variable=gpu_choice_var, state="readonly")
779
  quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM", lowvram_var, 5)
780
 
 
781
  def changerunmode(a,b,c):
782
  index = runopts_var.get()
783
  if index == "Use CLBlast" or index == "Use CuBLAS":
784
  gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
 
785
  quick_gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
786
+ if index == "Use CLBlast":
787
+ gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
788
+ quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
789
+ elif index == "Use CuBLAS":
790
+ CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
791
+ CUDA_quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
792
  else:
793
  gpu_selector_label.grid_forget()
794
  gpu_selector_box.grid_forget()
795
+ CUDA_gpu_selector_box.grid_forget()
796
  quick_gpu_selector_label.grid_forget()
797
  quick_gpu_selector_box.grid_forget()
798
+ CUDA_quick_gpu_selector_box.grid_forget()
799
 
800
  if index == "Use CuBLAS":
801
  lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
 
846
  gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 4, 50)
847
  gpu_selector_label = makelabel(hardware_tab, "GPU ID:", 3)
848
  gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly")
849
+ CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3", "All"], width=60, variable=gpu_choice_var, state="readonly")
850
  lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 5)
851
 
852
  # presets selector
 
947
  root.destroy()
948
  pass
949
 
950
+ def export_vars():
 
951
  args.threads = int(threads_var.get())
952
 
953
  args.usemlock = usemlock.get() == 1
 
960
  args.smartcontext = smartcontext.get()==1
961
  args.unbantokens = unbantokens.get()==1
962
 
963
+ gpuchoiceidx = 0
964
+ if gpu_choice_var.get()!="All":
965
+ gpuchoiceidx = int(gpu_choice_var.get())-1
966
  if runopts_var.get() == runopts[1]:
967
  args.useclblast = [[0,0], [1,0], [0,1]][gpuchoiceidx]
968
  if runopts_var.get() == runopts[2]:
969
+ if gpu_choice_var.get()=="All":
970
+ args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"]
971
+ else:
972
+ args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)]
973
  if gpulayers_var.get():
974
  args.gpulayers = int(gpulayers_var.get())
975
  if runopts_var.get()==runopts[3]:
 
980
  args.noavx2 = True
981
  args.noblas = True
982
  args.nommap = True
 
 
 
983
 
984
  args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
985
 
 
997
 
998
  args.hordeconfig = None if usehorde_var.get() == 0 else [horde_name_var.get(), horde_gen_var.get(), horde_context_var.get()]
999
 
1000
+ def import_vars(dict):
1001
+ threads_var.set(dict["threads"])
1002
+ usemlock.set(1 if dict["usemlock"] else 0)
1003
+ debugmode.set(1 if dict["debugmode"] else 0)
1004
+ launchbrowser.set(1 if dict["launch"] else 0)
1005
+ highpriority.set(1 if dict["highpriority"] else 0)
1006
+ disablemmap.set(1 if dict["nommap"] else 0)
1007
+ psutil.set(1 if dict["psutil_set_threads"] else 0)
1008
+ stream.set(1 if dict["stream"] else 0)
1009
+ smartcontext.set(1 if dict["smartcontext"] else 0)
1010
+ unbantokens.set(1 if dict["unbantokens"] else 0)
1011
+ runopts_var.set(runopts[0])
1012
+ if dict["useclblast"]:
1013
+ runopts_var.set(runopts[1])
1014
+ gpu_choice_var.set(str(["0 0", "1 0", "0 1"].index(str(dict["useclblast"][0]) + " " + str(dict["useclblast"][1])) + 1))
1015
+ elif dict["usecublas"]:
1016
+ runopts_var.set(runopts[2])
1017
+ if len(dict["usecublas"])==1:
1018
+ lowvram_var.set(1 if dict["usecublas"][0]=="lowvram" else 0)
1019
+ else:
1020
+ lowvram_var.set(1 if "lowvram" in dict["usecublas"] else 0)
1021
+ gpu_choice_var.set("1")
1022
+ for g in range(3):
1023
+ if str(g) in dict["usecublas"]:
1024
+ gpu_choice_var.set(str(g+1))
1025
+ break
1026
+ if dict["gpulayers"]:
1027
+ gpulayers_var.set(dict["gpulayers"])
1028
+
1029
+ if dict["noblas"] and dict["noavx2"]:
1030
+ runopts_var.set(runopts[5])
1031
+ elif dict["noavx2"]:
1032
+ runopts_var.set(runopts[5])
1033
+ elif dict["noblas"]:
1034
+ runopts_var.set(runopts[3])
1035
+ if dict["blasthreads"]:
1036
+ blas_threads_var.set(str(dict["blasthreads"]))
1037
+ else:
1038
+ blas_threads_var.set("")
1039
+
1040
+ if dict["contextsize"]:
1041
+ context_var.set(contextsize_text.index(str(dict["contextsize"])))
1042
+ if dict["blasbatchsize"]:
1043
+ blas_size_var.set(blasbatchsize_values.index(str(dict["blasbatchsize"])))
1044
+ if dict["forceversion"]:
1045
+ version_var.set(str(dict["forceversion"]))
1046
+
1047
+ if dict["mirostat"] and len(dict["mirostat"])>1:
1048
+ usemirostat.set(0 if str(dict["mirostat"][0])=="0" else 1)
1049
+ mirostat_var.set(str(dict["mirostat"][0]))
1050
+ mirostat_tau.set(str(dict["mirostat"][1]))
1051
+ mirostat_eta.set(str(dict["mirostat"][2]))
1052
+
1053
+ if dict["model_param"]:
1054
+ model_var.set(dict["model_param"])
1055
+
1056
+ if dict["lora"]:
1057
+ if len(dict["lora"]) > 1:
1058
+ lora_var.set(dict["lora"][0])
1059
+ lora_base_var.set(dict["lora"][1])
1060
+ else:
1061
+ lora_var.set(dict["lora"][0])
1062
+
1063
+ if dict["port_param"]:
1064
+ port_var.set(dict["port_param"])
1065
+
1066
+ if dict["host"]:
1067
+ host_var.set(dict["host"])
1068
+
1069
+ if dict["hordeconfig"] and len(dict["hordeconfig"]) > 1:
1070
+ horde_name_var.set(dict["hordeconfig"][0])
1071
+ horde_gen_var.set(dict["hordeconfig"][1])
1072
+ horde_context_var.set(dict["hordeconfig"][2])
1073
+
1074
+ def save_config():
1075
+ file_type = [("KoboldCpp Settings", "*.kcpps")]
1076
+ filename = asksaveasfile(filetypes=file_type, defaultextension=file_type)
1077
+ if filename == None: return
1078
+ export_vars()
1079
+ file = open(str(filename.name), 'a')
1080
+ file.write(json.dumps(args.__dict__))
1081
+ file.close()
1082
+ pass
1083
+
1084
+ def load_config():
1085
+ file_type = [("KoboldCpp Settings", "*.kcpps")]
1086
+ filename = askopenfilename(filetypes=file_type, defaultextension=file_type)
1087
+ if not filename or filename=="":
1088
+ return
1089
+ with open(filename, 'r') as f:
1090
+ dict = json.load(f)
1091
+ import_vars(dict)
1092
+ pass
1093
+
1094
+ ctk.CTkButton(tabs , text = "Launch", fg_color="#2f8d3c", command = guilaunch, width=80, height = 35 ).grid(row=1,column=1, stick="se", padx= 25, pady=5)
1095
+
1096
+ ctk.CTkButton(tabs , text = "Save", fg_color="#084a66", command = save_config, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 5, pady=5)
1097
+ ctk.CTkButton(tabs , text = "Load", fg_color="#084a66", command = load_config, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 70, pady=5)
1098
+
1099
+ ctk.CTkButton(tabs , text = "Old GUI", fg_color="#084a66", command = switch_old_gui, width=100, height = 35 ).grid(row=1,column=0, stick="sw", padx= 5, pady=5)
1100
+ # runs main loop until closed or launch clicked
1101
+ root.mainloop()
1102
+
1103
+ if nextstate==0:
1104
+ print("Exiting by user request.")
1105
+ time.sleep(2)
1106
+ sys.exit()
1107
+ elif nextstate==2:
1108
+ time.sleep(0.1)
1109
+ show_old_gui()
1110
+ else:
1111
+ # processing vars
1112
+ export_vars()
1113
+
1114
  if not args.model_param:
1115
  print("\nNo ggml model file was selected. Exiting.")
1116
  time.sleep(2)
 
1431
  parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
1432
  parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192], default=2048)
1433
  parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512)
1434
+ parser.add_argument("--linearrope", help="If set, uses linear RoPE scaling. Otherwise, uses NTK-Aware scaling.", action='store_true')
1435
  parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
1436
  parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
1437
  parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents the EOS token from being generated. This flag unbans it.", action='store_true')
 
1447
  compatgroup = parser.add_mutually_exclusive_group()
1448
  compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
1449
  compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
1450
+ compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID]'), choices=['normal', 'lowvram', '0', '1', '2'])
1451
  parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
1452
  args = parser.parse_args()
1453
  main(args)
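As background for the new --linearrope switch, a hedged sketch of the two RoPE-scaling modes in general terms (these formulas summarize the common definitions, not constants taken from this diff): with a scale factor s = n_ctx / n_ctx_train, linear scaling interpolates positions before the rotary embedding is applied,

    \theta_i = b^{-2i/d}, \qquad m \mapsto m / s,

while NTK-aware scaling keeps the positions intact and stretches the rotary base instead, commonly

    b \mapsto b \cdot s^{\,d/(d-2)},

where b is the rotary base (10000 in the original RoPE), d the head dimension, and m the token position. The exact scaling used by this build may differ, so treat this as a sketch of the two modes rather than the implemented constants.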
llama-util.h CHANGED
@@ -175,13 +175,13 @@ struct llama_mmap {
175
  llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
176
  size = file->size;
177
  int fd = fileno(file->fp);
178
- int flags = MAP_SHARED;
179
  // prefetch/readahead impairs performance on NUMA systems
180
  if (numa) { prefetch = 0; }
181
  #ifdef __linux__
182
  if (prefetch) { flags |= MAP_POPULATE; }
183
  #endif
184
- addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
185
  if (addr == MAP_FAILED) {
186
  throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
187
  }
@@ -223,7 +223,7 @@ struct llama_mmap {
223
  throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
224
  }
225
 
226
- addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
227
  error = GetLastError();
228
  CloseHandle(hMapping);
229
 
 
175
  llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
176
  size = file->size;
177
  int fd = fileno(file->fp);
178
+ int flags = MAP_PRIVATE;
179
  // prefetch/readahead impairs performance on NUMA systems
180
  if (numa) { prefetch = 0; }
181
  #ifdef __linux__
182
  if (prefetch) { flags |= MAP_POPULATE; }
183
  #endif
184
+ addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
185
  if (addr == MAP_FAILED) {
186
  throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
187
  }
 
223
  throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
224
  }
225
 
226
+ addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
227
  error = GetLastError();
228
  CloseHandle(hMapping);
229
 
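The mmap change above switches the model mapping from read-only/shared to writable copy-on-write (MAP_PRIVATE with PROT_READ|PROT_WRITE on POSIX, FILE_MAP_COPY on Windows), so loaded tensors can presumably be patched in memory without the modifications ever reaching the model file. A minimal sketch of the copy-on-write semantics, outside this codebase:

#include <stddef.h>
#include <sys/mman.h>

// Copy-on-write mapping: writes land in this process's private pages;
// the file on disk is never modified.
static void * map_copy_on_write(int fd, size_t size) {
    return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
}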
llama.cpp CHANGED
@@ -20,6 +20,9 @@
20
  #ifdef GGML_USE_METAL
21
  #include "ggml-metal.h"
22
  #endif
 
 
 
23
  #ifdef GGML_USE_K_QUANTS
24
  #ifndef QK_K
25
  #ifdef GGML_QKK_64
@@ -80,6 +83,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
80
  (void) tensor;
81
  }
82
 
83
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
84
  {
85
  static std::map<e_model, size_t> k_sizes = {
@@ -322,6 +344,9 @@ struct llama_context {
322
  // input embedding (1-dimensional array: [n_embd])
323
  std::vector<float> embedding;
324
 
 
 
 
325
  // memory buffers used to evaluate the model
326
  // TODO: move in llama_state
327
  llama_ctx_buffer buf_compute;
@@ -331,6 +356,10 @@ struct llama_context {
331
  ggml_metal_context * ctx_metal = NULL;
332
  #endif
333
 
 
 
 
 
334
  int buf_last = 0;
335
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
336
 
@@ -759,7 +788,6 @@ struct llama_model_loader {
759
 
760
  };
761
 
762
-
763
  //
764
  // kv cache
765
  //
@@ -850,7 +878,7 @@ bool llama_mlock_supported() {
850
  return llama_mlock::SUPPORTED;
851
  }
852
 
853
- void llama_init_backend(bool numa) {
854
  ggml_time_init();
855
 
856
  // needed to initialize f16 tables
@@ -863,6 +891,16 @@ void llama_init_backend(bool numa) {
863
  if (numa) {
864
  ggml_numa_init();
865
  }
866
  }
867
 
868
  int64_t llama_time_us() {
@@ -1265,16 +1303,16 @@ static bool llama_eval_internal(
1265
  llama_context & lctx,
1266
  const llama_token * tokens,
1267
  const float * embd,
1268
- const int n_tokens,
1269
- const int n_past,
1270
- const int n_threads,
1271
  const char * cgraph_fname) {
1272
 
1273
- // // enforce that the first token is BOS
1274
- // if (n_past == 0 && tokens[0] != llama_token_bos()) {
1275
- // fprintf(stderr, "%s: first token must be BOS\n", __func__);
1276
- // return false;
1277
- // }
1278
 
1279
  const int64_t t_start_us = ggml_time_us();
1280
 
@@ -1306,20 +1344,26 @@ static bool llama_eval_internal(
1306
 
1307
  struct ggml_context * ctx0 = ggml_init(params);
1308
 
 
 
1309
  // for big prompts, if BLAS is enabled, it is better to use only one thread
1310
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
1311
- ggml_cgraph gf = {};
1312
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1313
 
1314
  struct ggml_tensor * cur;
1315
  struct ggml_tensor * inpL;
1316
 
1317
  if (tokens) {
1318
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1319
- ggml_set_name(embd, "embd");
1320
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
1321
- inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
1322
  } else {
 
 
 
 
1323
  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
1324
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
1325
  }
@@ -1337,18 +1381,20 @@ static bool llama_eval_internal(
1337
  offload_func_t offload_func_v = llama_nop;
1338
 
1339
  #ifdef GGML_USE_CUBLAS
1340
- if (n_gpu_layers > n_layer) {
1341
- offload_func_nr = ggml_cuda_assign_buffers;
1342
- }
1343
- if (n_gpu_layers > n_layer + 1) {
1344
- offload_func_v = ggml_cuda_assign_buffers;
1345
- }
1346
- if (n_gpu_layers > n_layer + 2) {
1347
- offload_func_kq = ggml_cuda_assign_buffers;
1348
- }
1349
  #endif // GGML_USE_CUBLAS
1350
 
1351
  for (int il = 0; il < n_layer; ++il) {
 
 
1352
  offload_func_t offload_func = llama_nop;
1353
 
1354
  #ifdef GGML_USE_CUBLAS
@@ -1555,7 +1601,6 @@ static bool llama_eval_internal(
1555
 
1556
  // input for next layer
1557
  inpL = cur;
1558
-
1559
  }
1560
 
1561
  lctx.use_buf(ctx0, 0);
@@ -1563,7 +1608,6 @@ static bool llama_eval_internal(
1563
  // used at the end to optionally extract the embeddings
1564
  struct ggml_tensor * embeddings = NULL;
1565
 
1566
-
1567
  // norm
1568
  {
1569
  cur = ggml_rms_norm(ctx0, inpL);
@@ -1578,7 +1622,6 @@ static bool llama_eval_internal(
1578
  embeddings = cur;
1579
  }
1580
 
1581
-
1582
  // lm_head
1583
  cur = ggml_mul_mat(ctx0, model.output, cur);
1584
  ggml_set_name(cur, "result_output");
@@ -1591,8 +1634,13 @@ static bool llama_eval_internal(
1591
  // run the computation
1592
  ggml_build_forward_expand(&gf, cur);
1593
 
 
 
 
 
1594
  #ifdef GGML_USE_METAL
1595
  if (lctx.ctx_metal && N == 1) {
 
1596
  ggml_metal_graph_compute(lctx.ctx_metal, &gf);
1597
  ggml_metal_get_tensor (lctx.ctx_metal, cur);
1598
  } else {
@@ -1612,12 +1660,21 @@ static bool llama_eval_internal(
1612
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
1613
  }
1614
 
1615
- ggml_graph_compute(ctx0, &gf);
1616
  }
1617
  #else
1618
- ggml_graph_compute(ctx0, &gf);
1619
  #endif
1620
 
1621
  if (cgraph_fname) {
1622
  ggml_graph_export(&gf, cgraph_fname);
1623
  }
@@ -1633,23 +1690,17 @@ static bool llama_eval_internal(
1633
  // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
1634
  //}
1635
 
1636
- //embd_w.resize(n_vocab*N);
1637
- //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
1638
-
1639
- // update kv token count
1640
- lctx.kv_self.n = n_past + N;
1641
-
1642
  // extract logits
1643
  {
1644
  auto & logits_out = lctx.logits;
1645
 
1646
  if (lctx.logits_all) {
1647
  logits_out.resize(n_vocab * N);
1648
- memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
1649
  } else {
1650
  // return result for just the last token
1651
  logits_out.resize(n_vocab);
1652
- memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
1653
  }
1654
  }
1655
 
@@ -2118,6 +2169,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
2118
  }
2119
  }
2120
 
2121
 
2122
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
2123
  assert(ctx);
@@ -2405,15 +2512,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2405
  } else {
2406
  new_type = quantized_type;
2407
  #ifdef GGML_USE_K_QUANTS
 
2408
  if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
2409
  quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
2410
  int nx = tensor.ne.at(0);
2411
  int ny = tensor.ne.at(1);
2412
  if (nx % QK_K != 0 || ny % QK_K != 0) {
2413
- fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
2414
- fprintf(stderr, "Verify before using\n");
2415
- fprintf(stderr, "========================================================================================\n\n");
2416
- // throw std::runtime_error("Unsupported tensor size encountered\n");
2417
  }
2418
  }
2419
  if (tensor.name == "output.weight") {
@@ -2441,6 +2547,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2441
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2442
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2443
  }
2444
  #endif
2445
 
2446
  float * f32_data;
@@ -2575,8 +2692,8 @@ void llama_free_model(struct llama_model * model) {
2575
  }
2576
 
2577
  struct llama_context * llama_new_context_with_model(
2578
- struct llama_model * model,
2579
- struct llama_context_params params) {
2580
 
2581
  if (!model) {
2582
  return nullptr;
@@ -2646,7 +2763,7 @@ struct llama_context * llama_new_context_with_model(
2646
  #ifdef GGML_USE_METAL
2647
  if (params.n_gpu_layers > 0) {
2648
  // this allocates all Metal resources and memory buffers
2649
- ctx->ctx_metal = ggml_metal_init();
2650
 
2651
  void * data_ptr = NULL;
2652
  size_t data_size = 0;
@@ -2681,6 +2798,18 @@ struct llama_context * llama_new_context_with_model(
2681
  }
2682
  #endif
2683
 
2684
  return ctx;
2685
  }
2686
 
@@ -2803,6 +2932,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2803
  // read tensors and apply
2804
  bool warned = false;
2805
  int n_tensors = 0;
 
 
 
2806
  while (true) {
2807
  int32_t n_dims;
2808
  int32_t length;
@@ -2967,8 +3099,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2967
  }
2968
 
2969
  struct ggml_cgraph gf = ggml_build_forward(r);
2970
- gf.n_threads = n_threads;
2971
- ggml_graph_compute(lora_ctx, &gf);
2972
 
2973
  // we won't need these tensors again, reset the context to save memory
2974
  ggml_free(lora_ctx);
@@ -3121,7 +3253,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3121
 
3122
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
3123
  ggml_cgraph gf{};
3124
- gf.n_threads = 1;
3125
 
3126
  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
3127
  kout3d->data = out;
@@ -3141,7 +3272,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3141
 
3142
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
3143
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
3144
- ggml_graph_compute(cpy_ctx, &gf);
3145
 
3146
  ggml_free(cpy_ctx);
3147
  }
@@ -3227,7 +3358,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
3227
 
3228
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
3229
  ggml_cgraph gf{};
3230
- gf.n_threads = 1;
3231
 
3232
  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
3233
  kin3d->data = (void *) inp;
@@ -3247,7 +3377,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
3247
 
3248
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
3249
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
3250
- ggml_graph_compute(cpy_ctx, &gf);
3251
 
3252
  ggml_free(cpy_ctx);
3253
  }
 
20
  #ifdef GGML_USE_METAL
21
  #include "ggml-metal.h"
22
  #endif
23
+ #ifdef GGML_USE_MPI
24
+ #include "ggml-mpi.h"
25
+ #endif
26
  #ifdef GGML_USE_K_QUANTS
27
  #ifndef QK_K
28
  #ifdef GGML_QKK_64
 
83
  (void) tensor;
84
  }
85
 
86
+ //
87
+ // ggml helpers
88
+ //
89
+
90
+ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
91
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
92
+
93
+ if (plan.work_size > 0) {
94
+ buf.resize(plan.work_size);
95
+ plan.work_data = buf.data();
96
+ }
97
+
98
+ ggml_graph_compute(graph, &plan);
99
+ }
100
+
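For context, a hedged sketch of the ggml API change this helper wraps: the thread count and work buffer no longer live on the cgraph, and ggml_graph_compute() now takes an explicit plan. The wrapper function, the `cur` tensor, and `n_threads` below are illustrative; only the ggml calls themselves come from this diff (assumes <vector> and <cstdint> are included).

// Old API (removed in this change):
//   gf.n_threads = n_threads;
//   ggml_graph_compute(ctx0, &gf);
//
// New API, as wrapped by ggml_graph_compute_helper() above:
static void compute_graph(struct ggml_tensor * cur, int n_threads) {   // illustrative wrapper
    struct ggml_cgraph gf   = ggml_build_forward(cur);
    struct ggml_cplan  plan = ggml_graph_plan(&gf, n_threads);          // sizes the scratch work buffer
    std::vector<uint8_t> work(plan.work_size);
    if (plan.work_size > 0) {
        plan.work_data = work.data();
    }
    ggml_graph_compute(&gf, &plan);
}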
101
+ //
102
+ // memory sizes
103
+ //
104
+
105
  static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
106
  {
107
  static std::map<e_model, size_t> k_sizes = {
 
344
  // input embedding (1-dimensional array: [n_embd])
345
  std::vector<float> embedding;
346
 
347
+ // reusable buffer for `struct ggml_graph_plan.work_data`
348
+ std::vector<uint8_t> work_buffer;
349
+
350
  // memory buffers used to evaluate the model
351
  // TODO: move in llama_state
352
  llama_ctx_buffer buf_compute;
 
356
  ggml_metal_context * ctx_metal = NULL;
357
  #endif
358
 
359
+ #ifdef GGML_USE_MPI
360
+ ggml_mpi_context * ctx_mpi = NULL;
361
+ #endif
362
+
363
  int buf_last = 0;
364
  size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
365
 
 
788
 
789
  };
790
 
 
791
  //
792
  // kv cache
793
  //
 
878
  return llama_mlock::SUPPORTED;
879
  }
880
 
881
+ void llama_backend_init(bool numa) {
882
  ggml_time_init();
883
 
884
  // needed to initialize f16 tables
 
891
  if (numa) {
892
  ggml_numa_init();
893
  }
894
+
895
+ #ifdef GGML_USE_MPI
896
+ ggml_mpi_backend_init();
897
+ #endif
898
+ }
899
+
900
+ void llama_backend_free() {
901
+ #ifdef GGML_USE_MPI
902
+ ggml_mpi_backend_free();
903
+ #endif
904
  }
905
 
906
  int64_t llama_time_us() {
 
1303
  llama_context & lctx,
1304
  const llama_token * tokens,
1305
  const float * embd,
1306
+ int n_tokens,
1307
+ int n_past,
1308
+ int n_threads,
1309
  const char * cgraph_fname) {
1310
 
1311
+ LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
1312
+
1313
+ #ifdef GGML_USE_MPI
1314
+ ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
1315
+ #endif
1316
 
1317
  const int64_t t_start_us = ggml_time_us();
1318
 
 
1344
 
1345
  struct ggml_context * ctx0 = ggml_init(params);
1346
 
1347
+ ggml_cgraph gf = {};
1348
+
1349
  // for big prompts, if BLAS is enabled, it is better to use only one thread
1350
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
1351
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
1352
 
1353
  struct ggml_tensor * cur;
1354
  struct ggml_tensor * inpL;
1355
 
1356
  if (tokens) {
1357
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1358
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
1359
+ ggml_set_name(inp_tokens, "inp_tokens");
1360
+
1361
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
1362
  } else {
1363
+ #ifdef GGML_USE_MPI
1364
+ GGML_ASSERT(false && "not implemented");
1365
+ #endif
1366
+
1367
  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
1368
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
1369
  }
 
1381
  offload_func_t offload_func_v = llama_nop;
1382
 
1383
  #ifdef GGML_USE_CUBLAS
1384
+ if (n_gpu_layers > n_layer) {
1385
+ offload_func_nr = ggml_cuda_assign_buffers;
1386
+ }
1387
+ if (n_gpu_layers > n_layer + 1) {
1388
+ offload_func_v = ggml_cuda_assign_buffers;
1389
+ }
1390
+ if (n_gpu_layers > n_layer + 2) {
1391
+ offload_func_kq = ggml_cuda_assign_buffers;
1392
+ }
1393
  #endif // GGML_USE_CUBLAS
1394
 
1395
  for (int il = 0; il < n_layer; ++il) {
1396
+ ggml_format_name(inpL, "layer_inp_%d", il);
1397
+
1398
  offload_func_t offload_func = llama_nop;
1399
 
1400
  #ifdef GGML_USE_CUBLAS
 
1601
 
1602
  // input for next layer
1603
  inpL = cur;
 
1604
  }
1605
 
1606
  lctx.use_buf(ctx0, 0);
 
1608
  // used at the end to optionally extract the embeddings
1609
  struct ggml_tensor * embeddings = NULL;
1610
 
 
1611
  // norm
1612
  {
1613
  cur = ggml_rms_norm(ctx0, inpL);
 
1622
  embeddings = cur;
1623
  }
1624
 
 
1625
  // lm_head
1626
  cur = ggml_mul_mat(ctx0, model.output, cur);
1627
  ggml_set_name(cur, "result_output");
 
1634
  // run the computation
1635
  ggml_build_forward_expand(&gf, cur);
1636
 
1637
+ #if GGML_USE_MPI
1638
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
1639
+ #endif
1640
+
1641
  #ifdef GGML_USE_METAL
1642
  if (lctx.ctx_metal && N == 1) {
1643
+ ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
1644
  ggml_metal_graph_compute(lctx.ctx_metal, &gf);
1645
  ggml_metal_get_tensor (lctx.ctx_metal, cur);
1646
  } else {
 
1660
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
1661
  }
1662
 
1663
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
1664
  }
1665
  #else
1666
+ ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
1667
  #endif
1668
 
1669
+ #if GGML_USE_MPI
1670
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
1671
+ #endif
1672
+
1673
+ // update kv token count
1674
+ lctx.kv_self.n = n_past + N;
1675
+
1676
+ struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
1677
+
1678
  if (cgraph_fname) {
1679
  ggml_graph_export(&gf, cgraph_fname);
1680
  }
 
1690
  // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
1691
  //}
1692
 
1693
  // extract logits
1694
  {
1695
  auto & logits_out = lctx.logits;
1696
 
1697
  if (lctx.logits_all) {
1698
  logits_out.resize(n_vocab * N);
1699
+ memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
1700
  } else {
1701
  // return result for just the last token
1702
  logits_out.resize(n_vocab);
1703
+ memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
1704
  }
1705
  }
1706
 
 
2169
  }
2170
  }
2171
 
2172
+ static void llama_log_softmax(float * array, size_t size) {
2173
+ float max_l = *std::max_element(array, array + size);
2174
+ float sum = 0.f;
2175
+ for (size_t i = 0; i < size; ++i) {
2176
+ float p = expf(array[i] - max_l);
2177
+ sum += p;
2178
+ array[i] = p;
2179
+ }
2180
+
2181
+ for (size_t i = 0; i < size; ++i) {
2182
+ array[i] = logf(array[i] / sum);
2183
+ }
2184
+ }
2185
+
2186
+ void llama_sample_classifier_free_guidance(
2187
+ struct llama_context * ctx,
2188
+ llama_token_data_array * candidates,
2189
+ struct llama_context * guidance_ctx,
2190
+ float scale,
2191
+ float smooth_factor) {
2192
+ int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
2193
+
2194
+ assert(ctx);
2195
+ auto n_vocab = llama_n_vocab(ctx);
2196
+ assert(n_vocab == (int)candidates->size);
2197
+ assert(!candidates->sorted);
2198
+
2199
+ std::vector<float> logits_base;
2200
+ logits_base.reserve(candidates->size);
2201
+ for (size_t i = 0; i < candidates->size; ++i) {
2202
+ logits_base.push_back(candidates->data[i].logit);
2203
+ }
2204
+ llama_log_softmax(logits_base.data(), candidates->size);
2205
+
2206
+ float* logits_guidance = llama_get_logits(guidance_ctx);
2207
+ llama_log_softmax(logits_guidance, n_vocab);
2208
+
2209
+ for (int i = 0; i < n_vocab; ++i) {
2210
+ float logit_guidance = logits_guidance[i];
2211
+ float logit_base = logits_base[i];
2212
+ logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
2213
+ }
2214
+
2215
+ llama_log_softmax(logits_guidance, n_vocab);
2216
+
2217
+ for (int i = 0; i < n_vocab; ++i) {
2218
+ float logit_base = logits_base[i];
2219
+ float logit_guidance = logits_guidance[i];
2220
+
2221
+ candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
2222
+ }
2223
+
2224
+ if (ctx) {
2225
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2226
+ }
2227
+ }
2228
 
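In equation form, the guidance implemented above is, with b = log softmax(logits_base) and g = log softmax(logits_guidance):

    h = g + \text{scale} \cdot (b - g), \qquad h' = \log\mathrm{softmax}(h),
    \text{logit}'_i = \text{smooth\_factor} \cdot h'_i + (1 - \text{smooth\_factor}) \cdot b_i.

So scale = 1 leaves the base logits unchanged, while smooth_factor blends between the fully guided distribution (1.0) and the original one (0.0), matching the documentation added in llama.h.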
2229
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
2230
  assert(ctx);
 
2512
  } else {
2513
  new_type = quantized_type;
2514
  #ifdef GGML_USE_K_QUANTS
2515
+ bool convert_incompatible_tensor = false;
2516
  if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
2517
  quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
2518
  int nx = tensor.ne.at(0);
2519
  int ny = tensor.ne.at(1);
2520
  if (nx % QK_K != 0 || ny % QK_K != 0) {
2521
+ fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
2522
+ convert_incompatible_tensor = true;
 
 
2523
  }
2524
  }
2525
  if (tensor.name == "output.weight") {
 
2547
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2548
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2549
  }
2550
+ if (convert_incompatible_tensor) {
2551
+ if (tensor.name == "output.weight") {
2552
+ new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
2553
+ fprintf(stderr, "F16 will be used for this tensor instead.\n");
2554
+ } else if (tensor.name == "tok_embeddings.weight") {
2555
+ new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
2556
+ fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
2557
+ } else {
2558
+ throw std::runtime_error("Unsupported tensor size encountered\n");
2559
+ }
2560
+ }
2561
  #endif
2562
 
2563
  float * f32_data;
 
2692
  }
2693
 
2694
  struct llama_context * llama_new_context_with_model(
2695
+ struct llama_model * model,
2696
+ struct llama_context_params params) {
2697
 
2698
  if (!model) {
2699
  return nullptr;
 
2763
  #ifdef GGML_USE_METAL
2764
  if (params.n_gpu_layers > 0) {
2765
  // this allocates all Metal resources and memory buffers
2766
+ ctx->ctx_metal = ggml_metal_init(1);
2767
 
2768
  void * data_ptr = NULL;
2769
  size_t data_size = 0;
 
2798
  }
2799
  #endif
2800
 
2801
+ #ifdef GGML_USE_MPI
2802
+ ctx->ctx_mpi = ggml_mpi_init();
2803
+
2804
+ if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
2805
+ // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
2806
+ const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
2807
+ while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
2808
+ llama_backend_free();
2809
+ exit(1);
2810
+ }
2811
+ #endif
2812
+
2813
  return ctx;
2814
  }
2815
 
 
2932
  // read tensors and apply
2933
  bool warned = false;
2934
  int n_tensors = 0;
2935
+
2936
+ std::vector<uint8_t> work_buffer;
2937
+
2938
  while (true) {
2939
  int32_t n_dims;
2940
  int32_t length;
 
3099
  }
3100
 
3101
  struct ggml_cgraph gf = ggml_build_forward(r);
3102
+
3103
+ ggml_graph_compute_helper(work_buffer, &gf, n_threads);
3104
 
3105
  // we won't need these tensors again, reset the context to save memory
3106
  ggml_free(lora_ctx);
 
3253
 
3254
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
3255
  ggml_cgraph gf{};
 
3256
 
3257
  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
3258
  kout3d->data = out;
 
3272
 
3273
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
3274
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
3275
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
3276
 
3277
  ggml_free(cpy_ctx);
3278
  }
 
3358
 
3359
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
3360
  ggml_cgraph gf{};
 
3361
 
3362
  ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
3363
  kin3d->data = (void *) inp;
 
3377
 
3378
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
3379
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
3380
+ ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
3381
 
3382
  ggml_free(cpy_ctx);
3383
  }
llama.h CHANGED
@@ -158,7 +158,9 @@ extern "C" {
158
  // Initialize the llama + ggml backend
159
  // If numa is true, use NUMA optimizations
160
  // Call once at the start of the program
161
- LLAMA_API void llama_init_backend(bool numa);
 
 
162
 
163
  LLAMA_API int64_t llama_time_us();
164
 
@@ -307,6 +309,18 @@ extern "C" {
307
  /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
308
  LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
309
 
 
310
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
311
  LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
312
 
 
158
  // Initialize the llama + ggml backend
159
  // If numa is true, use NUMA optimizations
160
  // Call once at the start of the program
161
+ LLAMA_API void llama_backend_init(bool numa);
162
+ // Call once at the end of the program - currently only used for MPI
163
+ LLAMA_API void llama_backend_free();
164
 
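A minimal lifecycle sketch for the renamed pair. The model path, parameter choices, and the use of the usual loader entry points (llama_context_default_params, llama_load_model_from_file) are assumptions for illustration, not part of this diff.

#include "llama.h"

int main() {
    llama_backend_init(/*numa =*/ false);          // replaces llama_init_backend()

    llama_context_params params = llama_context_default_params();
    llama_model   * model = llama_load_model_from_file("model.bin", params);  // hypothetical path
    llama_context * ctx   = llama_new_context_with_model(model, params);

    // ... tokenize, llama_eval(), sample ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();                          // new: pairs with llama_backend_init()
    return 0;
}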
165
  LLAMA_API int64_t llama_time_us();
166
 
 
309
  /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
310
  LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
311
 
312
+ /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
313
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
314
+ /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
315
+ /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
316
+ /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
317
+ LLAMA_API void llama_sample_classifier_free_guidance(
318
+ struct llama_context * ctx,
319
+ llama_token_data_array * candidates,
320
+ struct llama_context * guidance_ctx,
321
+ float scale,
322
+ float smooth_factor);
323
+
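A hedged call-site sketch for the new sampler. Building the candidate array this way and the scale/smooth_factor values are illustrative; `guidance_ctx` is assumed to have been fed the negative prompt plus the same generated tokens, as the comment above describes (assumes <vector> is included).

static llama_token sample_with_cfg(llama_context * ctx, llama_context * guidance_ctx) {
    const int     n_vocab = llama_n_vocab(ctx);
    const float * logits  = llama_get_logits(ctx);

    std::vector<llama_token_data> cand(n_vocab);
    for (int i = 0; i < n_vocab; ++i) {
        cand[i] = { i, logits[i], 0.0f };          // unsorted logits straight from the context
    }
    llama_token_data_array cand_p = { cand.data(), cand.size(), /*sorted =*/ false };

    llama_sample_classifier_free_guidance(ctx, &cand_p, guidance_ctx,
                                          /*scale =*/ 1.5f, /*smooth_factor =*/ 1.0f);
    return llama_sample_token_greedy(ctx, &cand_p);
}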
324
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
325
  LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
326
 
otherarch/gpt2_v3.cpp CHANGED
@@ -447,7 +447,6 @@ bool gpt2_eval(
447
 
448
  struct ggml_context * ctx0 = ggml_init(params);
449
  struct ggml_cgraph gf = {};
450
- gf.n_threads = n_threads;
451
 
452
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
453
  memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -708,7 +707,7 @@ bool gpt2_eval(
708
 
709
  // run the computation
710
  ggml_build_forward_expand(&gf, inpL);
711
- ggml_graph_compute (ctx0, &gf);
712
 
713
  //if (n_past%100 == 0) {
714
  // ggml_graph_print (&gf);
 
447
 
448
  struct ggml_context * ctx0 = ggml_init(params);
449
  struct ggml_cgraph gf = {};
 
450
 
451
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
452
  memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
 
707
 
708
  // run the computation
709
  ggml_build_forward_expand(&gf, inpL);
710
+ kcpp_graph_compute_helper(&gf, n_threads);
711
 
712
  //if (n_past%100 == 0) {
713
  // ggml_graph_print (&gf);
otherarch/gptj_v3.cpp CHANGED
@@ -445,7 +445,6 @@ bool gptj_eval(
445
 
446
  struct ggml_context * ctx0 = ggml_init(params);
447
  struct ggml_cgraph gf = {};
448
- gf.n_threads = n_threads;
449
 
450
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
451
  memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -620,7 +619,7 @@ bool gptj_eval(
620
 
621
  // run the computation
622
  ggml_build_forward_expand(&gf, inpL);
623
- ggml_graph_compute (ctx0, &gf);
624
 
625
  //if (n_past%100 == 0) {
626
  // ggml_graph_print (&gf);
 
445
 
446
  struct ggml_context * ctx0 = ggml_init(params);
447
  struct ggml_cgraph gf = {};
 
448
 
449
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
450
  memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
 
619
 
620
  // run the computation
621
  ggml_build_forward_expand(&gf, inpL);
622
+ kcpp_graph_compute_helper(&gf, n_threads);
623
 
624
  //if (n_past%100 == 0) {
625
  // ggml_graph_print (&gf);
otherarch/mpt_v3.cpp CHANGED
@@ -383,7 +383,6 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
383
 
384
  struct ggml_context * ctx0 = ggml_init(params);
385
  struct ggml_cgraph gf = {};
386
- gf.n_threads = n_threads;
387
 
388
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
389
  memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
@@ -543,7 +542,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
543
 
544
  // run the computation
545
  ggml_build_forward_expand(&gf, inpL);
546
- ggml_graph_compute(ctx0, &gf);
547
 
548
  // std::cout << "Qcur" << std::endl;
549
  // print_tensor(Qcur);
 
383
 
384
  struct ggml_context * ctx0 = ggml_init(params);
385
  struct ggml_cgraph gf = {};
 
386
 
387
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
388
  memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
 
542
 
543
  // run the computation
544
  ggml_build_forward_expand(&gf, inpL);
545
+ kcpp_graph_compute_helper(&gf, n_threads);
546
 
547
  // std::cout << "Qcur" << std::endl;
548
  // print_tensor(Qcur);
otherarch/neox_v3.cpp CHANGED
@@ -461,7 +461,6 @@ bool gpt_neox_eval(
461
 
462
  struct ggml_context * ctx0 = ggml_init(params);
463
  struct ggml_cgraph gf = {};
464
- gf.n_threads = n_threads;
465
 
466
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
467
  memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -639,7 +638,7 @@ bool gpt_neox_eval(
639
 
640
  // run the computation
641
  ggml_build_forward_expand(&gf, inpL);
642
- ggml_graph_compute (ctx0, &gf);
643
 
644
  //if (n_past%100 == 0) {
645
  // ggml_graph_print (&gf);
 
461
 
462
  struct ggml_context * ctx0 = ggml_init(params);
463
  struct ggml_cgraph gf = {};
 
464
 
465
  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
466
  memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
 
638
 
639
  // run the computation
640
  ggml_build_forward_expand(&gf, inpL);
641
+ kcpp_graph_compute_helper(&gf, n_threads);
642
 
643
  //if (n_past%100 == 0) {
644
  // ggml_graph_print (&gf);
otherarch/rwkv_v3.cpp CHANGED
@@ -13,6 +13,8 @@
13
  #include "ggml-opencl.h"
14
  #endif
15
 
 
 
16
  #include <string>
17
  #include <vector>
18
  #include <cstring>
@@ -729,6 +731,7 @@ struct rwkv_context {
729
  float * logits_out = 0; //stores address of output logit buffer
730
 
731
  size_t gpu_layers;
 
732
  };
733
 
734
  // https://stackoverflow.com/a/6458689
@@ -1511,7 +1514,6 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptr<struct rwkv_instance
1511
  serial_graph.tokens = ggml_new_i32(serial_graph.ctx.ctx, 0);
1512
  serial_graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph());
1513
  RWKV_ASSERT_NULL_MSG(RWKV_ERROR_ALLOC, serial_graph.cgraph, "Failed to allocate serial graph");
1514
- serial_graph.cgraph->n_threads = n_threads;
1515
 
1516
  RWKV_ASSERT_NULL(RWKV_ERROR_GRAPH, rwkv_build_serial_graph(
1517
  serial_graph.ctx.ctx, instance->model,
@@ -1609,7 +1611,7 @@ void rwkv_get_outputs(const struct rwkv_context * ctx, float * state_out, float
1609
  }
1610
  }
1611
 
1612
- bool rwkv_eval(struct rwkv_context * ctx, const uint32_t token, const float * state_in, float * state_out, float * logits_out) {
1613
  ctx->last_error = RWKV_ERROR_NONE;
1614
 
1615
  const struct rwkv_file_header & header = ctx->instance->model.header;
@@ -1628,13 +1630,13 @@ bool rwkv_eval(struct rwkv_context * ctx, const uint32_t token, const float * st
1628
  ctx->serial_graph.cgraph->n_leafs = ctx->serial_graph.post_logits_leafs;
1629
  }
1630
 
1631
- ggml_graph_compute(ctx->serial_graph.ctx.ctx, ctx->serial_graph.cgraph.get());
1632
  rwkv_get_outputs(ctx, state_out, logits_out);
1633
 
1634
  return true;
1635
  }
1636
 
1637
- bool rwkv_eval_sequence(struct rwkv_context * ctx, const uint32_t * sequence, const size_t sequence_len, const float * state_in, float * state_out, float * logits_out) {
1638
  ctx->last_error = RWKV_ERROR_NONE;
1639
 
1640
  const struct rwkv_file_header & header = ctx->instance->model.header;
@@ -1690,7 +1692,6 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const uint32_t * sequence, co
1690
  sequence_graph.tokens = ggml_new_tensor_1d(sequence_graph.ctx.ctx, GGML_TYPE_I32, sequence_len);
1691
  sequence_graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph());
1692
  RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, sequence_graph.cgraph, "Failed to allocate sequence graph");
1693
- sequence_graph.cgraph->n_threads = 1;
1694
 
1695
  RWKV_ASSERT_FALSE(RWKV_ERROR_GRAPH, rwkv_build_sequence_graph(
1696
  sequence_graph.ctx.ctx, ctx->instance->model,
@@ -1717,7 +1718,7 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const uint32_t * sequence, co
1717
  ctx->sequence_graph.cgraph->n_leafs = ctx->sequence_graph.post_logits_leafs;
1718
  }
1719
 
1720
- ggml_graph_compute(ctx->sequence_graph.ctx.ctx, ctx->sequence_graph.cgraph.get());
1721
  rwkv_get_outputs(ctx, state_out, logits_out);
1722
  }
1723
 
 
13
  #include "ggml-opencl.h"
14
  #endif
15
 
16
+ #include "utils.h"
17
+
18
  #include <string>
19
  #include <vector>
20
  #include <cstring>
 
731
  float * logits_out = 0; //stores address of output logit buffer
732
 
733
  size_t gpu_layers;
734
+ std::vector<uint8_t> work_buffer;
735
  };
736
 
737
  // https://stackoverflow.com/a/6458689
 
1514
  serial_graph.tokens = ggml_new_i32(serial_graph.ctx.ctx, 0);
1515
  serial_graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph());
1516
  RWKV_ASSERT_NULL_MSG(RWKV_ERROR_ALLOC, serial_graph.cgraph, "Failed to allocate serial graph");
 
1517
 
1518
  RWKV_ASSERT_NULL(RWKV_ERROR_GRAPH, rwkv_build_serial_graph(
1519
  serial_graph.ctx.ctx, instance->model,
 
1611
  }
1612
  }
1613
 
1614
+ bool rwkv_eval(struct rwkv_context * ctx, const int n_threads, const uint32_t token, const float * state_in, float * state_out, float * logits_out) {
1615
  ctx->last_error = RWKV_ERROR_NONE;
1616
 
1617
  const struct rwkv_file_header & header = ctx->instance->model.header;
 
1630
  ctx->serial_graph.cgraph->n_leafs = ctx->serial_graph.post_logits_leafs;
1631
  }
1632
 
1633
+ kcpp_graph_compute_helper(ctx->serial_graph.cgraph.get(),n_threads);
1634
  rwkv_get_outputs(ctx, state_out, logits_out);
1635
 
1636
  return true;
1637
  }
1638
 
1639
+ bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const uint32_t * sequence, const size_t sequence_len, const float * state_in, float * state_out, float * logits_out) {
1640
  ctx->last_error = RWKV_ERROR_NONE;
1641
 
1642
  const struct rwkv_file_header & header = ctx->instance->model.header;
 
1692
  sequence_graph.tokens = ggml_new_tensor_1d(sequence_graph.ctx.ctx, GGML_TYPE_I32, sequence_len);
1693
  sequence_graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph());
1694
  RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, sequence_graph.cgraph, "Failed to allocate sequence graph");
 
1695
 
1696
  RWKV_ASSERT_FALSE(RWKV_ERROR_GRAPH, rwkv_build_sequence_graph(
1697
  sequence_graph.ctx.ctx, ctx->instance->model,
 
1718
  ctx->sequence_graph.cgraph->n_leafs = ctx->sequence_graph.post_logits_leafs;
1719
  }
1720
 
1721
+ kcpp_graph_compute_helper(ctx->sequence_graph.cgraph.get(),n_threads);
1722
  rwkv_get_outputs(ctx, state_out, logits_out);
1723
  }
1724
 
otherarch/rwkv_v3.h CHANGED
@@ -111,7 +111,7 @@ extern "C" {
111
  // - state_in: FP32 buffer of size rwkv_get_state_len(); or NULL, if this is a first pass.
112
  // - state_out: FP32 buffer of size rwkv_get_state_len(). This buffer will be written to if non-NULL.
113
  // - logits_out: FP32 buffer of size rwkv_get_logits_len(). This buffer will be written to if non-NULL.
114
- RWKV_API bool rwkv_eval(struct rwkv_context * ctx, const uint32_t token, const float * state_in, float * state_out, float * logits_out);
115
 
116
  // Evaluates the model for a sequence of tokens.
117
  // Uses a faster algorithm than rwkv_eval if you do not need the state and logits for every token. Best used with batch sizes of 64 or so.
@@ -125,7 +125,7 @@ extern "C" {
125
  // - state_in: FP32 buffer of size rwkv_get_state_len(), or NULL if this is a first pass.
126
  // - state_out: FP32 buffer of size rwkv_get_state_len(). This buffer will be written to if non-NULL.
127
  // - logits_out: FP32 buffer of size rwkv_get_logits_len(). This buffer will be written to if non-NULL.
128
- RWKV_API bool rwkv_eval_sequence(struct rwkv_context * ctx, const uint32_t * tokens, size_t sequence_len, const float * state_in, float * state_out, float * logits_out);
129
 
130
  // Returns the number of tokens in the given model's vocabulary.
131
  // Useful for telling 20B_tokenizer models (n_vocab = 50277) apart from World models (n_vocab = 65536).
 
111
  // - state_in: FP32 buffer of size rwkv_get_state_len(); or NULL, if this is a first pass.
112
  // - state_out: FP32 buffer of size rwkv_get_state_len(). This buffer will be written to if non-NULL.
113
  // - logits_out: FP32 buffer of size rwkv_get_logits_len(). This buffer will be written to if non-NULL.
114
+ RWKV_API bool rwkv_eval(struct rwkv_context *, const int n_threads, const uint32_t token, const float * state_in, float * state_out, float * logits_out);
115
 
116
  // Evaluates the model for a sequence of tokens.
117
  // Uses a faster algorithm than rwkv_eval if you do not need the state and logits for every token. Best used with batch sizes of 64 or so.
 
125
  // - state_in: FP32 buffer of size rwkv_get_state_len(), or NULL if this is a first pass.
126
  // - state_out: FP32 buffer of size rwkv_get_state_len(). This buffer will be written to if non-NULL.
127
  // - logits_out: FP32 buffer of size rwkv_get_logits_len(). This buffer will be written to if non-NULL.
128
+ RWKV_API bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const uint32_t * tokens, size_t sequence_len, const float * state_in, float * state_out, float * logits_out);
129
 
130
  // Returns the number of tokens in the given model's vocabulary.
131
  // Useful for telling 20B_tokenizer models (n_vocab = 50277) apart from World models (n_vocab = 65536).
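A hedged call-site sketch for the updated signatures, which now take the thread count explicitly. The thread count and token ids are illustrative, and the buffer sizes come from the getters referenced in the comments above (assumes <vector> is included).

static void rwkv_eval_example(struct rwkv_context * ctx) {
    const int n_threads = 4;                                   // illustrative
    std::vector<float> state (rwkv_get_state_len(ctx));
    std::vector<float> logits(rwkv_get_logits_len(ctx));

    // single token, first pass (no input state)
    rwkv_eval(ctx, n_threads, /*token =*/ 0, /*state_in =*/ NULL, state.data(), logits.data());

    // a short sequence, feeding the state back in
    const uint32_t tokens[] = { 1, 2, 3 };                     // illustrative token ids
    rwkv_eval_sequence(ctx, n_threads, tokens, 3, state.data(), state.data(), logits.data());
}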
otherarch/utils.cpp CHANGED
@@ -221,4 +221,16 @@ bool should_transpose_layer(std::string name)
221
  return true;
222
  }
223
  return false;
224
  }
 
221
  return true;
222
  }
223
  return false;
224
+ }
225
+
226
+ static std::vector<uint8_t> kcpp_compute_buf;
227
+ void kcpp_graph_compute_helper(ggml_cgraph *graph, int n_threads)
228
+ {
229
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
230
+ if (plan.work_size > 0)
231
+ {
232
+ kcpp_compute_buf.resize(plan.work_size);
233
+ plan.work_data = kcpp_compute_buf.data();
234
+ }
235
+ ggml_graph_compute(graph, &plan);
236
  }
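A brief usage sketch for the helper added above (illustrative only; some_output_tensor and n_threads are placeholders, not names from this patch): it wraps the new two-step plan/compute API and grows a shared scratch buffer on demand.

 // Build a forward graph for an existing tensor, then run it on n_threads.
 struct ggml_cgraph gf = ggml_build_forward(some_output_tensor);
 kcpp_graph_compute_helper(&gf, n_threads);

Because kcpp_compute_buf is a single static vector, the work allocation is reused across calls, which avoids repeated allocations but also means the helper is not safe to call concurrently from multiple threads.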
otherarch/utils.h CHANGED
@@ -54,4 +54,6 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
54
 
55
 
56
 
57
- bool should_transpose_layer(std::string name);
 
 
 
54
 
55
 
56
 
57
+ bool should_transpose_layer(std::string name);
58
+
59
+ void kcpp_graph_compute_helper(ggml_cgraph * graph, int n_threads);
spm-headers/ggml.h CHANGED
@@ -65,7 +65,7 @@
65
  // ggml_set_f32(a, 3.0f);
66
  // ggml_set_f32(b, 4.0f);
67
  //
68
- // ggml_graph_compute(ctx0, &gf);
69
  //
70
  // printf("f = %f\n", ggml_get_f32_1d(f, 0));
71
  //
@@ -132,10 +132,10 @@
132
  // {
133
  // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
134
  //
135
- // // a[1, 2] = 1.0f;
136
  // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
137
  //
138
- // // a[2, 0] = 2.0f;
139
  // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
140
  //
141
  // ...
@@ -197,12 +197,18 @@
197
  #define GGML_MAX_NODES 4096
198
  #define GGML_MAX_PARAMS 256
199
  #define GGML_MAX_CONTEXTS 64
200
- #define GGML_MAX_OPT 4
201
  #define GGML_MAX_NAME 48
202
  #define GGML_DEFAULT_N_THREADS 4
203
 
204
  #define GGML_UNUSED(x) (void)(x)
205
 
206
  #define GGML_ASSERT(x) \
207
  do { \
208
  if (!(x)) { \
@@ -414,12 +420,7 @@ extern "C" {
414
  bool is_param;
415
 
416
  struct ggml_tensor * grad;
417
- struct ggml_tensor * src0;
418
- struct ggml_tensor * src1;
419
- struct ggml_tensor * opt[GGML_MAX_OPT];
420
-
421
- // thread scheduling
422
- int n_tasks;
423
 
424
  // performance
425
  int perf_runs;
@@ -432,19 +433,27 @@ extern "C" {
432
 
433
  void * extra; // extra things e.g. for ggml-cuda.cu
434
 
435
- char padding[4];
436
  };
437
 
438
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
439
 
440
  // computation graph
441
  struct ggml_cgraph {
442
  int n_nodes;
443
  int n_leafs;
444
- int n_threads;
445
-
446
- size_t work_size;
447
- struct ggml_tensor * work;
448
 
449
  struct ggml_tensor * nodes[GGML_MAX_NODES];
450
  struct ggml_tensor * grads[GGML_MAX_NODES];
@@ -532,6 +541,8 @@ extern "C" {
532
  // use this to compute the memory overhead of a tensor
533
  GGML_API size_t ggml_tensor_overhead(void);
534
 
 
 
535
  GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
536
 
537
  // main
@@ -1292,15 +1303,22 @@ extern "C" {
1292
 
1293
  GGML_API void ggml_set_param(
1294
  struct ggml_context * ctx,
1295
- struct ggml_tensor * tensor);
1296
 
1297
  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1298
 
1299
  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
1300
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
1301
 
1302
- GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
1303
- GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
1304
 
1305
  GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
1306
 
 
65
  // ggml_set_f32(a, 3.0f);
66
  // ggml_set_f32(b, 4.0f);
67
  //
68
+ // ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
69
  //
70
  // printf("f = %f\n", ggml_get_f32_1d(f, 0));
71
  //
 
132
  // {
133
  // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
134
  //
135
+ // // a[2, 1] = 1.0f;
136
  // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
137
  //
138
+ // // a[0, 2] = 2.0f;
139
  // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
140
  //
141
  // ...
 
197
  #define GGML_MAX_NODES 4096
198
  #define GGML_MAX_PARAMS 256
199
  #define GGML_MAX_CONTEXTS 64
200
+ #define GGML_MAX_SRC 6
201
  #define GGML_MAX_NAME 48
202
  #define GGML_DEFAULT_N_THREADS 4
203
 
204
  #define GGML_UNUSED(x) (void)(x)
205
 
206
+ // Maximum training context of the model in use
207
+ // For the LLaMA models this is normally 2048, but somehow "stepping out" by 128 gives better results (tested at 7B and 13B)
208
+ #ifndef GGML_TRAINING_CTX
209
+ #define GGML_TRAINING_CTX 2048
210
+ #endif
211
+
212
  #define GGML_ASSERT(x) \
213
  do { \
214
  if (!(x)) { \
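Since GGML_TRAINING_CTX is guarded by #ifndef, a build can override the 2048 default without editing this header, either with a compiler define (-DGGML_TRAINING_CTX=...) or before the include; a hypothetical sketch, where 2176 is simply 2048 + 128 from the comment above rather than a value taken from this patch:

 // Hypothetical override: any definition seen before ggml.h is included wins.
 #define GGML_TRAINING_CTX 2176
 #include "ggml.h"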
 
420
  bool is_param;
421
 
422
  struct ggml_tensor * grad;
423
+ struct ggml_tensor * src[GGML_MAX_SRC];
424
 
425
  // performance
426
  int perf_runs;
 
433
 
434
  void * extra; // extra things e.g. for ggml-cuda.cu
435
 
436
+ char padding[8];
437
  };
438
 
439
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
440
 
441
+ // the compute plan that needs to be prepared for ggml_graph_compute()
442
+ // since https://github.com/ggerganov/ggml/issues/287
443
+ struct ggml_cplan {
444
+ size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
445
+ uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
446
+
447
+ int n_threads;
448
+
449
+ // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
450
+ int n_tasks[GGML_MAX_NODES];
451
+ };
452
+
453
  // computation graph
454
  struct ggml_cgraph {
455
  int n_nodes;
456
  int n_leafs;
457
 
458
  struct ggml_tensor * nodes[GGML_MAX_NODES];
459
  struct ggml_tensor * grads[GGML_MAX_NODES];
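Together with the ggml_graph_plan() / ggml_graph_compute() declarations further down in this header, the new ggml_cplan struct turns graph execution into a two-step call; a minimal sketch of the flow, assuming gf, ctx and n_threads already exist, with buffer handling that mirrors kcpp_graph_compute_helper in otherarch/utils.cpp above:

 // Plan first, provide work_data if the plan asks for it, then compute.
 struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads);
 std::vector<uint8_t> work;
 if (plan.work_size > 0) {
     work.resize(plan.work_size);
     plan.work_data = work.data();
 }
 ggml_graph_compute(&gf, &plan);

 // Or, if the ggml_context has enough spare memory, the one-shot wrapper:
 ggml_graph_compute_with_ctx(ctx, &gf, n_threads);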
 
541
  // use this to compute the memory overhead of a tensor
542
  GGML_API size_t ggml_tensor_overhead(void);
543
 
544
+ GGML_API void set_ntk_rope_scale_mode(bool useNtk);
545
+ GGML_API bool get_ntk_rope_scale_mode();
546
  GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
547
 
548
  // main
 
1303
 
1304
  GGML_API void ggml_set_param(
1305
  struct ggml_context * ctx,
1306
+ struct ggml_tensor * tensor);
1307
 
1308
  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1309
 
1310
  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
1311
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
1312
 
1313
+ // ggml_graph_plan() has to be called before ggml_graph_compute()
1314
+ // when plan.work_size > 0, caller must allocate memory for plan.work_data
1315
+ GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1316
+ GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1317
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
1318
+
1319
+ // same as ggml_graph_compute() but the work data is allocated as a part of the context
1320
+ // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
1321
+ GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
1322
 
1323
  GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
1324