Illumotion committed on
Commit edc20ac
1 Parent(s): 65dc8e0

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff
.gitignore CHANGED
@@ -81,12 +81,12 @@ tests/test-tokenizer-0
81
  koboldcpp.so
82
  koboldcpp_failsafe.so
83
  koboldcpp_openblas.so
84
- koboldcpp_openblas_noavx2.so
85
  koboldcpp_clblast.so
86
  koboldcpp.dll
87
  koboldcpp_failsafe.dll
88
  koboldcpp_openblas.dll
89
- koboldcpp_openblas_noavx2.dll
90
  koboldcpp_clblast.dll
91
  koboldcpp_cublas.dll
92
  cublas64_11.dll
 
81
  koboldcpp.so
82
  koboldcpp_failsafe.so
83
  koboldcpp_openblas.so
84
+ koboldcpp_noavx2.so
85
  koboldcpp_clblast.so
86
  koboldcpp.dll
87
  koboldcpp_failsafe.dll
88
  koboldcpp_openblas.dll
89
+ koboldcpp_noavx2.dll
90
  koboldcpp_clblast.dll
91
  koboldcpp_cublas.dll
92
  cublas64_11.dll
CMakeLists.txt CHANGED
@@ -3,9 +3,9 @@
3
  # IT WILL NOT BE UPDATED OR MAINTAINED !!!
4
 
5
  message(STATUS "============== ============== ==============")
6
- message(STATUS "WARNING! Do NOT use this file. It is UNSUPPORTED for normal users. Use MAKE instead.")
7
- message(STATUS "It is ONLY for CUBLAS build testing on windows visual studio. IT WILL NOT BE UPDATED OR MAINTAINED !!!")
8
- message(STATUS "IF YOU ARE SEEING THIS, you MUST ONLY be building AN EXPERIMENAL WINDOWS CUBLAS BUILD! NOTHING ELSE WILL BE SUPPORTED !!!")
9
  message(STATUS "============== ============== ==============")
10
 
11
  cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
@@ -43,11 +43,12 @@ if (NOT MSVC)
43
  endif()
44
 
45
  # 3rd party libs
46
- option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
 
47
  set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
48
  set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
49
  set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
50
- option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
51
  set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
52
  option(LLAMA_K_QUANTS "llama: use k-quants" ON)
53
 
@@ -79,13 +80,15 @@ if (LLAMA_CUBLAS)
79
  set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
80
 
81
  add_compile_definitions(GGML_USE_CUBLAS)
 
 
82
  #add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
83
 
84
  add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
85
  add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
86
  add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
87
- if (LLAMA_CUDA_DMMV_F16)
88
- add_compile_definitions(GGML_CUDA_DMMV_F16)
89
  endif()
90
  add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
91
 
@@ -96,10 +99,19 @@ if (LLAMA_CUBLAS)
96
  endif()
97
 
98
  if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
99
- if (LLAMA_CUDA_DMMV_F16)
100
- set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
 
 
 
 
101
  else()
102
- set(CMAKE_CUDA_ARCHITECTURES "37;52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
 
 
 
 
 
103
  endif()
104
  endif()
105
  message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -120,6 +132,7 @@ if (LLAMA_ALL_WARNINGS)
120
  -Wshadow
121
  -Wstrict-prototypes
122
  -Wpointer-arith
 
123
  )
124
  set(cxx_flags
125
  -Wall
@@ -259,6 +272,8 @@ endif()
259
  add_library(ggml OBJECT
260
  ggml.c
261
  ggml.h
 
 
262
  k_quants.h
263
  k_quants.c
264
  ${GGML_SOURCES_CUDA})
 
3
  # IT WILL NOT BE UPDATED OR MAINTAINED !!!
4
 
5
  message(STATUS "============== ============== ==============")
6
+ message(STATUS "WARNING! Recommend NOT to use this file. It is UNSUPPORTED for normal users. Use MAKE instead.")
7
+ message(STATUS "It is ONLY for CUBLAS builds on windows visual studio. IT WILL OVERWRITE YOUR EXISTING MAKEFILE !!!")
8
+ message(STATUS "IF YOU ARE SEEING THIS, you MUST ONLY be building CUBLAS BUILDS! NOTHING ELSE WILL BE SUPPORTED !!!")
9
  message(STATUS "============== ============== ==============")
10
 
11
  cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
 
43
  endif()
44
 
45
  # 3rd party libs
46
+ option(LLAMA_CUBLAS "llama: use CUDA" ON)
47
+ set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
48
  set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
49
  set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
50
  set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
51
+ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
52
  set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
53
  option(LLAMA_K_QUANTS "llama: use k-quants" ON)
54
 
 
80
  set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
81
 
82
  add_compile_definitions(GGML_USE_CUBLAS)
83
+ #add_compile_definitions(GGML_CUDA_CUBLAS) #remove to not use cublas
84
+ add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
85
  #add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
86
 
87
  add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
88
  add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
89
  add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
90
+ if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
91
+ add_compile_definitions(GGML_CUDA_F16)
92
  endif()
93
  add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
94
 
 
99
  endif()
100
 
101
  if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
102
+ # 52 == lowest CUDA 12 standard
103
+ # 60 == f16 CUDA intrinsics
104
+ # 61 == integer CUDA intrinsics
105
+ # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
106
+ if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
107
+ set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
108
  else()
109
+ message("CUDA Toolkit Version: ${CUDAToolkit_VERSION}")
110
+ if(CUDAToolkit_VERSION VERSION_GREATER 12)
111
+ set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
112
+ else()
113
+ set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
114
+ endif()
115
  endif()
116
  endif()
117
  message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 
132
  -Wshadow
133
  -Wstrict-prototypes
134
  -Wpointer-arith
135
+ -Wmissing-prototypes
136
  )
137
  set(cxx_flags
138
  -Wall
 
272
  add_library(ggml OBJECT
273
  ggml.c
274
  ggml.h
275
+ ggml-alloc.c
276
+ ggml-alloc.h
277
  k_quants.h
278
  k_quants.c
279
  ${GGML_SOURCES_CUDA})
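The new CMake block above boils the CUDA architecture choice down to a small decision: require compute 60+ when f16 intrinsics are needed, and only keep compute 3.7 for toolkits no newer than 12. A rough, illustrative Python sketch of that same decision (the CMake snippet above is the authoritative version; the function name here is invented for illustration):

```python
def pick_cuda_architectures(use_f16: bool, toolkit_version: tuple) -> str:
    # Mirrors the CMake logic above: 52 = lowest CUDA 12 standard,
    # 60 = f16 intrinsics, 61 = integer intrinsics,
    # 70 = assumed faster mul_mat_q loop unrolling.
    if use_f16:
        return "60;61;70"
    if toolkit_version > (12,):   # CMake: CUDAToolkit_VERSION VERSION_GREATER 12
        return "52;61;70"
    return "37;52;61;70"

print(pick_cuda_architectures(False, (11, 8)))   # -> 37;52;61;70
```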
Makefile CHANGED
@@ -1,4 +1,4 @@
1
- default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast koboldcpp_cublas
2
  tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
3
  dev: koboldcpp_openblas
4
  dev2: koboldcpp_clblast
@@ -42,7 +42,7 @@ endif
42
 
43
  # keep standard at C11 and C++11
44
  CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
45
- CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
46
  LDFLAGS =
47
 
48
  # these are used on windows, to build some libraries with extra old device compatibility
@@ -165,20 +165,34 @@ else ifdef LLAMA_CUDA_DMMV_Y
165
  else
166
  NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
167
  endif # LLAMA_CUDA_MMV_Y
 
 
 
168
  ifdef LLAMA_CUDA_DMMV_F16
169
- NVCCFLAGS += -DGGML_CUDA_DMMV_F16
170
  endif # LLAMA_CUDA_DMMV_F16
171
  ifdef LLAMA_CUDA_KQUANTS_ITER
172
  NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
173
  else
174
  NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
175
  endif
 
 
 
 
 
 
 
 
 
 
 
176
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
177
- $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
178
  ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
179
- $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
180
  ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
181
- $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
182
  endif # LLAMA_CUBLAS
183
 
184
  ifdef LLAMA_METAL
@@ -213,7 +227,7 @@ endif
213
  DEFAULT_BUILD =
214
  FAILSAFE_BUILD =
215
  OPENBLAS_BUILD =
216
- OPENBLAS_NOAVX2_BUILD =
217
  CLBLAST_BUILD =
218
  CUBLAS_BUILD =
219
 
@@ -221,7 +235,7 @@ ifeq ($(OS),Windows_NT)
221
  DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
222
  FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
223
  OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o [email protected] $(LDFLAGS)
224
- OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o [email protected] $(LDFLAGS)
225
  CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o [email protected] $(LDFLAGS)
226
 
227
  ifdef LLAMA_CUBLAS
@@ -233,7 +247,7 @@ else
233
  FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
234
  ifdef LLAMA_OPENBLAS
235
  OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
236
- OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
237
  endif
238
  ifdef LLAMA_CLBLAST
239
  ifeq ($(UNAME_S),Darwin)
@@ -283,8 +297,8 @@ ggml_openblas.o: ggml.c ggml.h
283
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
284
  ggml_failsafe.o: ggml.c ggml.h
285
  $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
286
- ggml_openblas_noavx2.o: ggml.c ggml.h
287
- $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
288
  ggml_clblast.o: ggml.c ggml.h
289
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
290
  ggml_cublas.o: ggml.c ggml.h
@@ -298,6 +312,10 @@ k_quants_noavx2.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
298
  k_quants_failsafe.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
299
  $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
300
 
 
 
 
 
301
  #version 2 libs
302
  ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
303
  $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
@@ -305,8 +323,8 @@ ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
305
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
306
  ggml_v2_failsafe.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
307
  $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
308
- ggml_v2_openblas_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
309
- $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
310
  ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
311
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
312
  ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
@@ -327,10 +345,12 @@ ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-ope
327
  $(CC) $(CFLAGS) -c $< -o $@
328
 
329
  # intermediate objects
330
- llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
331
  $(CXX) $(CXXFLAGS) -c $< -o $@
332
  common.o: examples/common.cpp examples/common.h
333
  $(CXX) $(CXXFLAGS) -c $< -o $@
 
 
334
  grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
335
  $(CXX) $(CXXFLAGS) -c $< -o $@
336
  expose.o: expose.cpp expose.h
@@ -348,37 +368,37 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
348
  $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
349
 
350
  clean:
351
- rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so
352
 
353
- main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o grammar-parser.o $(OBJS)
354
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
355
  @echo
356
  @echo '==== Run ./main -h for help. ===='
357
  @echo
358
 
359
  #generated libraries
360
- koboldcpp: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o $(OBJS)
361
  $(DEFAULT_BUILD)
362
- koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o $(OBJS)
363
  $(OPENBLAS_BUILD)
364
- koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o $(OBJS)
365
  $(FAILSAFE_BUILD)
366
- koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o $(OBJS)
367
- $(OPENBLAS_NOAVX2_BUILD)
368
- koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o $(OBJS)
369
  $(CLBLAST_BUILD)
370
- koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o $(CUBLAS_OBJS) $(OBJS)
371
  $(CUBLAS_BUILD)
372
 
373
- quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o
374
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
375
- quantize_gptj: ggml.o llama.o k_quants.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
376
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
377
- quantize_gpt2: ggml.o llama.o k_quants.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
378
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
379
- quantize_neox: ggml.o llama.o k_quants.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
380
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
381
- quantize_mpt: ggml.o llama.o k_quants.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
382
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
383
 
384
 
 
1
+ default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_cublas
2
  tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
3
  dev: koboldcpp_openblas
4
  dev2: koboldcpp_clblast
 
42
 
43
  # keep standard at C11 and C++11
44
  CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
45
+ CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
46
  LDFLAGS =
47
 
48
  # these are used on windows, to build some libraries with extra old device compatibility
 
165
  else
166
  NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
167
  endif # LLAMA_CUDA_MMV_Y
168
+ ifdef LLAMA_CUDA_F16
169
+ NVCCFLAGS += -DGGML_CUDA_F16
170
+ endif # LLAMA_CUDA_F16
171
  ifdef LLAMA_CUDA_DMMV_F16
172
+ NVCCFLAGS += -DGGML_CUDA_F16
173
  endif # LLAMA_CUDA_DMMV_F16
174
  ifdef LLAMA_CUDA_KQUANTS_ITER
175
  NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
176
  else
177
  NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
178
  endif
179
+ ifdef LLAMA_CUDA_MMQ_Y
180
+ NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
181
+ else
182
+ NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
183
+ endif # LLAMA_CUDA_MMQ_Y
184
+ #ifdef LLAMA_CUDA_CUBLAS
185
+ # NVCCFLAGS += -DGGML_CUDA_CUBLAS
186
+ #endif # LLAMA_CUDA_CUBLAS
187
+ ifdef LLAMA_CUDA_CCBIN
188
+ NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
189
+ endif
190
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
191
+ $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
192
  ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
193
+ $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
194
  ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
195
+ $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
196
  endif # LLAMA_CUBLAS
197
 
198
  ifdef LLAMA_METAL
 
227
  DEFAULT_BUILD =
228
  FAILSAFE_BUILD =
229
  OPENBLAS_BUILD =
230
+ NOAVX2_BUILD =
231
  CLBLAST_BUILD =
232
  CUBLAS_BUILD =
233
 
 
235
  DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
236
  FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
237
  OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o [email protected] $(LDFLAGS)
238
+ NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
239
  CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o [email protected] $(LDFLAGS)
240
 
241
  ifdef LLAMA_CUBLAS
 
247
  FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
248
  ifdef LLAMA_OPENBLAS
249
  OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
250
+ NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
251
  endif
252
  ifdef LLAMA_CLBLAST
253
  ifeq ($(UNAME_S),Darwin)
 
297
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
298
  ggml_failsafe.o: ggml.c ggml.h
299
  $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
300
+ ggml_noavx2.o: ggml.c ggml.h
301
+ $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
302
  ggml_clblast.o: ggml.c ggml.h
303
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
304
  ggml_cublas.o: ggml.c ggml.h
 
312
  k_quants_failsafe.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
313
  $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
314
 
315
+ #there's no intrinsics or special gpu ops used here, so we can have a universal object
316
+ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
317
+ $(CC) $(CFLAGS) -c $< -o $@
318
+
319
  #version 2 libs
320
  ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
321
  $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
 
323
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
324
  ggml_v2_failsafe.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
325
  $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
326
+ ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
327
+ $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
328
  ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
329
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
330
  ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
 
345
  $(CC) $(CFLAGS) -c $< -o $@
346
 
347
  # intermediate objects
348
+ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
349
  $(CXX) $(CXXFLAGS) -c $< -o $@
350
  common.o: examples/common.cpp examples/common.h
351
  $(CXX) $(CXXFLAGS) -c $< -o $@
352
+ console.o: examples/console.cpp examples/console.h
353
+ $(CXX) $(CXXFLAGS) -c $< -o $@
354
  grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
355
  $(CXX) $(CXXFLAGS) -c $< -o $@
356
  expose.o: expose.cpp expose.h
 
368
  $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
369
 
370
  clean:
371
+ rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so
372
 
373
+ main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
374
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
375
  @echo
376
  @echo '==== Run ./main -h for help. ===='
377
  @echo
378
 
379
  #generated libraries
380
+ koboldcpp: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o $(OBJS)
381
  $(DEFAULT_BUILD)
382
+ koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o $(OBJS)
383
  $(OPENBLAS_BUILD)
384
+ koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o $(OBJS)
385
  $(FAILSAFE_BUILD)
386
+ koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o $(OBJS)
387
+ $(NOAVX2_BUILD)
388
+ koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o $(OBJS)
389
  $(CLBLAST_BUILD)
390
+ koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(OBJS)
391
  $(CUBLAS_BUILD)
392
 
393
+ quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o
394
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
395
+ quantize_gptj: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
396
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
397
+ quantize_gpt2: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
398
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
399
+ quantize_neox: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
400
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
401
+ quantize_mpt: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
402
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
403
 
404
 
README.md CHANGED
@@ -1,6 +1,80 @@
1
- ---
2
- title: Koboldcpp
3
- sdk: docker
4
- colorFrom: blue
5
- colorTo: green
6
- ---
 
1
+ # koboldcpp
2
+
3
+ KoboldCpp is an easy-to-use AI text-generation software for GGML models. It's a single self-contained distributable from Concedo that builds off llama.cpp and adds a versatile Kobold API endpoint, additional format support, backward compatibility, as well as a fancy UI with persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer.
4
+
5
+ ![Preview](media/preview.png)
6
+
7
+ ## Usage
8
+ - **[Download the latest .exe release here](https://github.com/LostRuins/koboldcpp/releases/latest)** or clone the git repo.
9
+ - Windows binaries are provided in the form of **koboldcpp.exe**, which is a pyinstaller wrapper for a few **.dll** files and **koboldcpp.py**. If you feel concerned, you may prefer to rebuild it yourself with the provided makefiles and scripts.
10
+ - Weights are not included; you can use the official llama.cpp `quantize.exe` to generate them from your official weight files (or download them from other places such as [TheBloke's Huggingface](https://huggingface.co/TheBloke)).
11
+ - To run, execute **koboldcpp.exe** or drag and drop your quantized `ggml_model.bin` file onto the .exe, and then connect with Kobold or Kobold Lite. If you're not on Windows, run the script **KoboldCpp.py** after compiling the libraries.
12
+ - Launching with no command line arguments displays a GUI containing a subset of configurable settings. Generally you don't have to change much besides the `Presets` and `GPU Layers`. Read the `--help` output for more info about each setting.
13
+ - By default, you can connect to http://localhost:5001
14
+ - You can also run it using the command line `koboldcpp.exe [ggml_model.bin] [port]`. For info, please check `koboldcpp.exe --help`
15
+ - Default context size too small? Try `--contextsize 3072` to 1.5x your context size without much perplexity gain. Note that you'll also have to increase the max context in the Kobold Lite UI (click and edit the number text field).
16
+ - Big context too slow? Try the `--smartcontext` flag to reduce prompt processing frequency. Also, you can try running on your GPU using CLBlast with the `--useclblast` flag for a speedup.
17
+ - Want even more speedup? Combine `--useclblast` with `--gpulayers` to offload entire layers to the GPU! **Much faster, but uses more VRAM**. Experiment to determine the number of layers to offload, and reduce by a few if you run out of memory.
18
+ - If you are having crashes or issues, you can try turning off BLAS with the `--noblas` flag. You can also try running in a non-avx2 compatibility mode with `--noavx2`. Lastly, you can try turning off mmap with `--nommap`.
19
+
20
+ For more information, be sure to run the program with the `--help` flag.
21
+
22
+ ## OSX and Linux
23
+ - You will have to compile your binaries from source. A makefile is provided; simply run `make`.
24
+ - If you want you can also link your own install of OpenBLAS manually with `make LLAMA_OPENBLAS=1`
25
+ - Alternatively, you can also link your own install of CLBlast manually with `make LLAMA_CLBLAST=1`; for this you will need to obtain and link the OpenCL and CLBlast libraries.
26
+ - For Arch Linux: Install `cblas` `openblas` and `clblast`.
27
+ - For Debian: Install `libclblast-dev` and `libopenblas-dev`.
28
+ - For a full featured build, do `make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1`
29
+ - After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.bin] [port]`
30
+ - Note: Many OSX users have found that using Accelerate is actually faster than OpenBLAS. To try, you may wish to run with `--noblas` and compare speeds.
31
+
32
+ ## Compiling on Windows
33
+ - You're encouraged to use the released .exe, but if you want to compile your binaries from source on Windows, the easiest way is:
34
+ - Use the latest release of w64devkit (https://github.com/skeeto/w64devkit). Be sure to use the "vanilla" one, not the i686 or other variants. If you use those, they will conflict with the precompiled libs!
35
+ - Make sure you are using the w64devkit integrated terminal, then run `make` in the KoboldCpp source folder. This will create the .dll files.
36
+ - If you want to generate the .exe file, make sure you have the python module PyInstaller installed with pip ('pip install PyInstaller').
37
+ - Run the script make_pyinstaller.bat in a regular terminal (or Windows Explorer).
38
+ - The koboldcpp.exe file will be in your dist folder.
39
+ - If you wish to use your own version of the additional Windows libraries (OpenCL, CLBlast and OpenBLAS), you can do it with:
40
+ - OpenCL - tested with https://github.com/KhronosGroup/OpenCL-SDK . If you wish to compile it, follow the repository instructions. You will need vcpkg.
41
+ - CLBlast - tested with https://github.com/CNugteren/CLBlast . If you wish to compile it you will need to reference the OpenCL files. It will only generate the ".lib" file if you compile using MSVC.
42
+ - OpenBLAS - tested with https://github.com/xianyi/OpenBLAS .
43
+ - Move the respective .lib files to the /lib folder of your project, overwriting the older files.
44
+ - Also, replace the existing versions of the corresponding .dll files located in the project directory root (e.g. libopenblas.dll).
45
+ - Make the KoboldCPP project using the instructions above.
46
+
47
+ ## Android (Termux) Alternative method
48
+ - See https://github.com/ggerganov/llama.cpp/pull/1828/files
49
+
50
+ ## Using CuBLAS
51
+ - If you're on Windows with an Nvidia GPU you can get CUDA support out of the box using the `--usecublas` flag; make sure you select the correct .exe with CUDA support.
52
+ - You can attempt a CuBLAS build with `LLAMA_CUBLAS=1` or using the provided CMake file (best for visual studio users). If you use the CMake file to build, copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC.
53
+
54
+ ## Questions and Help
55
+ - **First, please check out [The KoboldCpp FAQ and Knowledgebase](https://github.com/LostRuins/koboldcpp/wiki) which may already have answers to your questions! Also please search through past issues and discussions.**
56
+ - If you cannot find an answer, open an issue on this github, or find us on the [KoboldAI Discord](https://koboldai.org/discord).
57
+
58
+ ## Considerations
59
+ - For Windows: no installation, single-file executable (It Just Works)
60
+ - Since v1.0.6, requires libopenblas; the prebuilt Windows binaries are included in this repo. If not found, it will fall back to a mode without BLAS.
61
+ - Since v1.15, requires CLBlast if enabled; the prebuilt Windows binaries are included in this repo. If not found, it will fall back to a mode without CLBlast.
62
+ - Since v1.33, you can set the context size to be above what the model supports officially. It does increase perplexity but should still work well below 4096 even on untuned models (for GPT-NeoX, GPT-J, and LLAMA models). Customize this with `--ropeconfig`.
63
+ - **I plan to keep backwards compatibility with ALL past llama.cpp AND alpaca.cpp models**. But you are also encouraged to reconvert/update your models if possible for best results.
64
+
65
+ ## License
66
+ - The original GGML library and llama.cpp by ggerganov are licensed under the MIT License
67
+ - However, Kobold Lite is licensed under the AGPL v3.0 License
68
+ - The other files are also under the AGPL v3.0 License unless otherwise stated
69
+
70
+ ## Notes
71
+ - Generation delay scales linearly with original prompt length. If OpenBLAS is enabled, prompt ingestion becomes about 2-3x faster. This is automatic on Windows, but will require linking on OSX and Linux. CLBlast speeds this up even further, and `--gpulayers` + `--useclblast` even more so.
72
+ - I have heard of someone claiming a false positive AV report. The exe is a simple pyinstaller bundle that includes the necessary python scripts and dlls to run. If this still concerns you, you might wish to rebuild everything from source code using the makefile, and you can rebuild the exe yourself with pyinstaller by using `make_pyinstaller.bat`
73
+ - Supported GGML models (Includes backward compatibility for older versions/legacy GGML models, though some newer features might be unavailable):
74
+ - LLAMA and LLAMA2 (LLaMA / Alpaca / GPT4All / Vicuna / Koala / Pygmalion 7B / Metharme 7B / WizardLM and many more)
75
+ - GPT-2 / Cerebras
76
+ - GPT-J
77
+ - RWKV
78
+ - GPT-NeoX / Pythia / StableLM / Dolly / RedPajama
79
+ - MPT models
80
+
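As a quick illustration of the "Kobold API endpoint" and default port the new README mentions, here is a minimal Python sketch that sends a generation request to a locally running KoboldCpp instance. The `/api/v1/generate` route and the payload/response fields follow the usual Kobold API convention and are assumptions here; check the API of your KoboldCpp version.

```python
# Minimal sketch: query a running KoboldCpp server on the default port 5001.
# Route and field names are assumed from the Kobold API convention.
import json
import urllib.request

payload = {"prompt": "Once upon a time,", "max_length": 64}
req = urllib.request.Request(
    "http://localhost:5001/api/v1/generate",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    result = json.load(resp)
print(result["results"][0]["text"])
```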
convert.py CHANGED
@@ -133,7 +133,7 @@ TENSORS_SET = set(TENSORS_LIST)
133
 
134
  def find_n_mult(n_ff: int, n_embd: int) -> int:
135
  # hardcoded magic range
136
- for n_mult in range(256, 1, -1):
137
  calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
138
  if calc_ff == n_ff:
139
  return n_mult
@@ -141,11 +141,12 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
141
 
142
  @dataclass
143
  class Params:
144
- n_vocab: int
145
- n_embd: int
146
- n_mult: int
147
- n_head: int
148
- n_layer: int
 
149
 
150
  @staticmethod
151
  def guessed(model: 'LazyModel') -> 'Params':
@@ -167,11 +168,12 @@ class Params:
167
  n_head=n_embd // 128 # guessed
168
 
169
  return Params(
170
- n_vocab = n_vocab,
171
- n_embd = n_embd,
172
- n_mult = 256,
173
- n_head = n_head,
174
- n_layer = n_layer,
 
175
  )
176
 
177
  @staticmethod
@@ -183,15 +185,17 @@ class Params:
183
  n_head = config["num_attention_heads"];
184
  n_layer = config["num_hidden_layers"];
185
  n_ff = config["intermediate_size"];
 
186
 
187
  n_mult = find_n_mult(n_ff, n_embd);
188
 
189
  return Params(
190
- n_vocab = n_vocab,
191
- n_embd = n_embd,
192
- n_mult = n_mult,
193
- n_head = n_head,
194
- n_layer = n_layer,
 
195
  )
196
 
197
  # LLaMA v2 70B params.json
@@ -200,21 +204,22 @@ class Params:
200
  def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
201
  config = json.load(open(config_path))
202
 
203
- n_vocab = config["vocab_size"];
204
- n_embd = config["dim"];
205
- n_head = config["n_heads"];
206
- n_layer = config["n_layers"];
207
- n_mult = config["multiple_of"];
208
 
209
  if n_vocab == -1:
210
  n_vocab = model["tok_embeddings.weight"].shape[0]
211
 
212
  return Params(
213
- n_vocab = n_vocab,
214
- n_embd = n_embd,
215
- n_mult = n_mult,
216
- n_head = n_head,
217
- n_layer = n_layer,
 
218
  )
219
 
220
  @staticmethod
@@ -317,10 +322,12 @@ class GGMLVocab:
317
  Vocab = Union[SentencePieceVocab, GGMLVocab]
318
 
319
 
320
- def permute(weights: NDArray, n_head: int) -> NDArray:
 
 
321
  return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
322
- .swapaxes(1, 2)
323
- .reshape(weights.shape))
324
 
325
 
326
  def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
@@ -368,7 +375,7 @@ class Tensor(metaclass=ABCMeta):
368
  @abstractmethod
369
  def astype(self, data_type: DataType) -> 'Tensor': ...
370
  @abstractmethod
371
- def permute(self, n_head: int) -> 'Tensor': ...
372
  @abstractmethod
373
  def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
374
  @abstractmethod
@@ -406,8 +413,8 @@ class UnquantizedTensor(Tensor):
406
  r = self.ndarray.shape[0] // 3
407
  return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
408
 
409
- def permute(self, n_head: int) -> 'UnquantizedTensor':
410
- return UnquantizedTensor(permute(self.ndarray, n_head))
411
 
412
 
413
  def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
@@ -455,26 +462,34 @@ class GGMLQuantizedTensor(Tensor):
455
  def to_ggml(self) -> 'GGMLQuantizedTensor':
456
  return self
457
 
458
- def permute(self, n_head: int) -> 'GGMLQuantizedTensor':
459
- return GGMLQuantizedTensor(permute(self.ndarray, n_head), self.shape, self.data_type)
 
 
 
 
460
 
 
 
 
461
 
462
  GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
463
 
464
 
465
  class DeferredPermutedTensor(Tensor):
466
- def __init__(self, base: Tensor, n_head: int) -> None:
467
  self.base = base
468
  self.n_head = n_head
 
469
  self.data_type = self.base.data_type
470
 
471
  def astype(self, data_type: DataType) -> Tensor:
472
- return self.base.astype(data_type).permute(self.n_head)
473
 
474
  def to_ggml(self) -> GGMLCompatibleTensor:
475
- return self.base.to_ggml().permute(self.n_head)
476
 
477
- def permute(self, n_head: int) -> Tensor:
478
  raise Exception("shouldn't permute twice")
479
 
480
 
@@ -566,8 +581,8 @@ class GPTQForLLaMaQuantizedTensor(Tensor):
566
  ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
567
  return ret
568
 
569
- def permute(self, n_head: int) -> Tensor:
570
- return DeferredPermutedTensor(self, n_head)
571
 
572
  def to_ggml(self) -> GGMLQuantizedTensor:
573
  # The output format looks like this:
@@ -698,10 +713,10 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
698
  return ModelPlus(model, paths, format, vocab)
699
 
700
 
701
- def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
702
  def load() -> Tensor:
703
- return lazy_tensor.load().permute(n_head)
704
- return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
705
 
706
  def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
707
  def load() -> Tensor:
@@ -726,7 +741,7 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
726
  for i in itertools.count():
727
  if f"model.layers.{i}.self_attn.q_proj.weight" in model:
728
  out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
729
- out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
730
  out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
731
  elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
732
  out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
 
133
 
134
  def find_n_mult(n_ff: int, n_embd: int) -> int:
135
  # hardcoded magic range
136
+ for n_mult in range(8192, 1, -1):
137
  calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
138
  if calc_ff == n_ff:
139
  return n_mult
 
141
 
142
  @dataclass
143
  class Params:
144
+ n_vocab: int
145
+ n_embd: int
146
+ n_mult: int
147
+ n_head: int
148
+ n_layer: int
149
+ n_kv_head: Optional[int] # This parameter is only used for Llama 2
150
 
151
  @staticmethod
152
  def guessed(model: 'LazyModel') -> 'Params':
 
168
  n_head=n_embd // 128 # guessed
169
 
170
  return Params(
171
+ n_vocab = n_vocab,
172
+ n_embd = n_embd,
173
+ n_mult = 256,
174
+ n_head = n_head,
175
+ n_layer = n_layer,
176
+ n_kv_head = None,
177
  )
178
 
179
  @staticmethod
 
185
  n_head = config["num_attention_heads"];
186
  n_layer = config["num_hidden_layers"];
187
  n_ff = config["intermediate_size"];
188
+ n_kv_head = config.get("num_key_value_heads")
189
 
190
  n_mult = find_n_mult(n_ff, n_embd);
191
 
192
  return Params(
193
+ n_vocab = n_vocab,
194
+ n_embd = n_embd,
195
+ n_mult = n_mult,
196
+ n_head = n_head,
197
+ n_layer = n_layer,
198
+ n_kv_head = n_kv_head,
199
  )
200
 
201
  # LLaMA v2 70B params.json
 
204
  def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
205
  config = json.load(open(config_path))
206
 
207
+ n_vocab = config["vocab_size"];
208
+ n_embd = config["dim"];
209
+ n_head = config["n_heads"];
210
+ n_layer = config["n_layers"];
211
+ n_mult = config["multiple_of"];
212
 
213
  if n_vocab == -1:
214
  n_vocab = model["tok_embeddings.weight"].shape[0]
215
 
216
  return Params(
217
+ n_vocab = n_vocab,
218
+ n_embd = n_embd,
219
+ n_mult = n_mult,
220
+ n_head = n_head,
221
+ n_layer = n_layer,
222
+ n_kv_head = None,
223
  )
224
 
225
  @staticmethod
 
322
  Vocab = Union[SentencePieceVocab, GGMLVocab]
323
 
324
 
325
+ def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
326
+ if n_kv_head is not None and n_head != n_kv_head:
327
+ n_head //= n_kv_head
328
  return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
329
+ .swapaxes(1, 2)
330
+ .reshape(weights.shape))
331
 
332
 
333
  def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
 
375
  @abstractmethod
376
  def astype(self, data_type: DataType) -> 'Tensor': ...
377
  @abstractmethod
378
+ def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
379
  @abstractmethod
380
  def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
381
  @abstractmethod
 
413
  r = self.ndarray.shape[0] // 3
414
  return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
415
 
416
+ def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
417
+ return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))
418
 
419
 
420
  def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
 
462
  def to_ggml(self) -> 'GGMLQuantizedTensor':
463
  return self
464
 
465
+ def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
466
+ return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)
467
+
468
+ def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
469
+ r = self.ndarray.shape[0] // 3
470
+ return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
471
 
472
+ def part(self, n_part: int) -> 'UnquantizedTensor':
473
+ r = self.ndarray.shape[0] // 3
474
+ return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
475
 
476
  GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
477
 
478
 
479
  class DeferredPermutedTensor(Tensor):
480
+ def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
481
  self.base = base
482
  self.n_head = n_head
483
+ self.n_kv_head = n_kv_head
484
  self.data_type = self.base.data_type
485
 
486
  def astype(self, data_type: DataType) -> Tensor:
487
+ return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)
488
 
489
  def to_ggml(self) -> GGMLCompatibleTensor:
490
+ return self.base.to_ggml().permute(self.n_head, self.n_kv_head)
491
 
492
+ def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
493
  raise Exception("shouldn't permute twice")
494
 
495
 
 
581
  ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
582
  return ret
583
 
584
+ def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
585
+ return DeferredPermutedTensor(self, n_head, n_kv_head)
586
 
587
  def to_ggml(self) -> GGMLQuantizedTensor:
588
  # The output format looks like this:
 
713
  return ModelPlus(model, paths, format, vocab)
714
 
715
 
716
+ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
717
  def load() -> Tensor:
718
+ return lazy_tensor.load().permute(n_head, n_kv_head)
719
+ return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)
720
 
721
  def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
722
  def load() -> Tensor:
 
741
  for i in itertools.count():
742
  if f"model.layers.{i}.self_attn.q_proj.weight" in model:
743
  out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
744
+ out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
745
  out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
746
  elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
747
  out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
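The convert.py changes above thread a new `n_kv_head` parameter through `permute()` so that LLaMA-2 grouped-query-attention key projections are reordered correctly. A small self-contained numpy sketch of the same reshape, on toy shapes rather than a real checkpoint, shows that the transform reorders rows while preserving the tensor shape:

```python
import numpy as np

# Same reshape logic as the updated permute() in convert.py, on a toy (8, 4) "weight".
def permute(weights, n_head, n_kv_head=None):
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head          # GQA: fewer key/value heads than query heads
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

w = np.arange(32).reshape(8, 4)
out = permute(w, n_head=4, n_kv_head=2)   # effective n_head becomes 4 // 2 = 2
assert out.shape == w.shape               # rows are interleaved, shape is unchanged
print(out[:4])
```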
examples/CMakeLists.txt CHANGED
@@ -13,6 +13,8 @@ set(TARGET common)
13
  add_library(${TARGET} OBJECT
14
  common.h
15
  common.cpp
 
 
16
  grammar-parser.h
17
  grammar-parser.cpp
18
  )
 
13
  add_library(${TARGET} OBJECT
14
  common.h
15
  common.cpp
16
+ console.h
17
+ console.cpp
18
  grammar-parser.h
19
  grammar-parser.cpp
20
  )
examples/common.cpp CHANGED
@@ -25,7 +25,6 @@
25
  #else
26
  #include <sys/ioctl.h>
27
  #include <unistd.h>
28
- #include <wchar.h>
29
  #endif
30
 
31
  #if defined(_MSC_VER)
@@ -195,6 +194,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
195
  break;
196
  }
197
  params.rope_freq_scale = std::stof(argv[i]);
 
 
 
 
 
 
198
  } else if (arg == "--memory-f32") {
199
  params.memory_f16 = false;
200
  } else if (arg == "--top-p") {
@@ -329,6 +334,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
329
  params.instruct = true;
330
  } else if (arg == "--multiline-input") {
331
  params.multiline_input = true;
 
 
332
  } else if (arg == "--color") {
333
  params.use_color = true;
334
  } else if (arg == "--mlock") {
@@ -352,7 +359,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
352
  #ifdef GGML_USE_CUBLAS
353
  params.main_gpu = std::stoi(argv[i]);
354
  #else
355
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
356
  #endif
357
  } else if (arg == "--tensor-split" || arg == "-ts") {
358
  if (++i >= argc) {
@@ -376,13 +383,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
376
  }
377
  }
378
  #else
379
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 
 
 
 
 
 
380
  #endif // GGML_USE_CUBLAS
381
  } else if (arg == "--low-vram" || arg == "-lv") {
382
  #ifdef GGML_USE_CUBLAS
383
  params.low_vram = true;
384
  #else
385
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
386
  #endif // GGML_USE_CUBLAS
387
  } else if (arg == "--no-mmap") {
388
  params.use_mmap = false;
@@ -402,8 +415,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
402
  params.antiprompt.push_back(argv[i]);
403
  } else if (arg == "--perplexity") {
404
  params.perplexity = true;
405
- } else if (arg == "--perplexity-lines") {
406
- params.perplexity_lines = true;
 
 
 
 
 
 
407
  } else if (arg == "--ignore-eos") {
408
  params.logit_bias[llama_token_eos()] = -INFINITY;
409
  } else if (arg == "--no-penalize-nl") {
@@ -551,16 +570,18 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
551
  fprintf(stdout, " --cfg-negative-prompt PROMPT \n");
552
  fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
553
  fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
554
- fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
555
- fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
 
556
  fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
557
  fprintf(stdout, " --no-penalize-nl do not penalize newline token\n");
558
  fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
559
  fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
560
  fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp);
561
  fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n");
562
- fprintf(stdout, " --perplexity-lines compute perplexity over each line of the prompt\n");
563
- fprintf(stdout, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
 
564
  fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
565
  if (llama_mlock_supported()) {
566
  fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
@@ -578,10 +599,14 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
578
  fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
579
  fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
580
  fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
 
 
 
581
  #endif
582
  fprintf(stdout, " --mtest compute maximum memory usage\n");
583
  fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
584
  fprintf(stdout, " --verbose-prompt print prompt before generation\n");
 
585
  fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
586
  fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
587
  fprintf(stdout, " -m FNAME, --model FNAME\n");
@@ -630,6 +655,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
630
  lparams.main_gpu = params.main_gpu;
631
  lparams.tensor_split = params.tensor_split;
632
  lparams.low_vram = params.low_vram;
 
633
  lparams.seed = params.seed;
634
  lparams.f16_kv = params.memory_f16;
635
  lparams.use_mmap = params.use_mmap;
@@ -673,376 +699,3 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
673
 
674
  return std::make_tuple(model, lctx);
675
  }
676
-
677
- void console_init(console_state & con_st) {
678
- #if defined(_WIN32)
679
- // Windows-specific console initialization
680
- DWORD dwMode = 0;
681
- con_st.hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
682
- if (con_st.hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(con_st.hConsole, &dwMode)) {
683
- con_st.hConsole = GetStdHandle(STD_ERROR_HANDLE);
684
- if (con_st.hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(con_st.hConsole, &dwMode))) {
685
- con_st.hConsole = NULL;
686
- }
687
- }
688
- if (con_st.hConsole) {
689
- // Enable ANSI colors on Windows 10+
690
- if (con_st.use_color && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
691
- SetConsoleMode(con_st.hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
692
- }
693
- // Set console output codepage to UTF8
694
- SetConsoleOutputCP(CP_UTF8);
695
- }
696
- HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
697
- if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
698
- // Set console input codepage to UTF16
699
- _setmode(_fileno(stdin), _O_WTEXT);
700
-
701
- // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
702
- dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
703
- SetConsoleMode(hConIn, dwMode);
704
- }
705
- #else
706
- // POSIX-specific console initialization
707
- struct termios new_termios;
708
- tcgetattr(STDIN_FILENO, &con_st.prev_state);
709
- new_termios = con_st.prev_state;
710
- new_termios.c_lflag &= ~(ICANON | ECHO);
711
- new_termios.c_cc[VMIN] = 1;
712
- new_termios.c_cc[VTIME] = 0;
713
- tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
714
-
715
- con_st.tty = fopen("/dev/tty", "w+");
716
- if (con_st.tty != nullptr) {
717
- con_st.out = con_st.tty;
718
- }
719
-
720
- setlocale(LC_ALL, "");
721
- #endif
722
- }
723
-
724
- void console_cleanup(console_state & con_st) {
725
- // Reset console color
726
- console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
727
-
728
- #if !defined(_WIN32)
729
- if (con_st.tty != nullptr) {
730
- con_st.out = stdout;
731
- fclose(con_st.tty);
732
- con_st.tty = nullptr;
733
- }
734
- // Restore the terminal settings on POSIX systems
735
- tcsetattr(STDIN_FILENO, TCSANOW, &con_st.prev_state);
736
- #endif
737
- }
738
-
739
- /* Keep track of current color of output, and emit ANSI code if it changes. */
740
- void console_set_color(console_state & con_st, console_color_t color) {
741
- if (con_st.use_color && con_st.color != color) {
742
- fflush(stdout);
743
- switch(color) {
744
- case CONSOLE_COLOR_DEFAULT:
745
- fprintf(con_st.out, ANSI_COLOR_RESET);
746
- break;
747
- case CONSOLE_COLOR_PROMPT:
748
- fprintf(con_st.out, ANSI_COLOR_YELLOW);
749
- break;
750
- case CONSOLE_COLOR_USER_INPUT:
751
- fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
752
- break;
753
- case CONSOLE_COLOR_ERROR:
754
- fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
755
- break;
756
- }
757
- con_st.color = color;
758
- fflush(con_st.out);
759
- }
760
- }
761
-
762
- char32_t getchar32() {
763
- #if defined(_WIN32)
764
- HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
765
- wchar_t high_surrogate = 0;
766
-
767
- while (true) {
768
- INPUT_RECORD record;
769
- DWORD count;
770
- if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
771
- return WEOF;
772
- }
773
-
774
- if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
775
- wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
776
- if (wc == 0) {
777
- continue;
778
- }
779
-
780
- if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
781
- high_surrogate = wc;
782
- continue;
783
- } else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
784
- if (high_surrogate != 0) { // Check if we have a high surrogate
785
- return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
786
- }
787
- }
788
-
789
- high_surrogate = 0; // Reset the high surrogate
790
- return static_cast<char32_t>(wc);
791
- }
792
- }
793
- #else
794
- wchar_t wc = getwchar();
795
- if (static_cast<wint_t>(wc) == WEOF) {
796
- return WEOF;
797
- }
798
-
799
- #if WCHAR_MAX == 0xFFFF
800
- if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
801
- wchar_t low_surrogate = getwchar();
802
- if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
803
- return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
804
- }
805
- }
806
- if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
807
- return 0xFFFD; // Return the replacement character U+FFFD
808
- }
809
- #endif
810
-
811
- return static_cast<char32_t>(wc);
812
- #endif
813
- }
814
-
815
- void pop_cursor(console_state & con_st) {
816
- #if defined(_WIN32)
817
- if (con_st.hConsole != NULL) {
818
- CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
819
- GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo);
820
-
821
- COORD newCursorPosition = bufferInfo.dwCursorPosition;
822
- if (newCursorPosition.X == 0) {
823
- newCursorPosition.X = bufferInfo.dwSize.X - 1;
824
- newCursorPosition.Y -= 1;
825
- } else {
826
- newCursorPosition.X -= 1;
827
- }
828
-
829
- SetConsoleCursorPosition(con_st.hConsole, newCursorPosition);
830
- return;
831
- }
832
- #endif
833
- putc('\b', con_st.out);
834
- }
835
-
836
- int estimateWidth(char32_t codepoint) {
837
- #if defined(_WIN32)
838
- return 1;
839
- #else
840
- return wcwidth(codepoint);
841
- #endif
842
- }
843
-
844
- int put_codepoint(console_state & con_st, const char* utf8_codepoint, size_t length, int expectedWidth) {
845
- #if defined(_WIN32)
846
- CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
847
- if (!GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo)) {
848
- // go with the default
849
- return expectedWidth;
850
- }
851
- COORD initialPosition = bufferInfo.dwCursorPosition;
852
- DWORD nNumberOfChars = length;
853
- WriteConsole(con_st.hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
854
-
855
- CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
856
- GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
857
-
858
- // Figure out our real position if we're in the last column
859
- if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
860
- DWORD nNumberOfChars;
861
- WriteConsole(con_st.hConsole, &" \b", 2, &nNumberOfChars, NULL);
862
- GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
863
- }
864
-
865
- int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
866
- if (width < 0) {
867
- width += newBufferInfo.dwSize.X;
868
- }
869
- return width;
870
- #else
871
- // we can trust expectedWidth if we've got one
872
- if (expectedWidth >= 0 || con_st.tty == nullptr) {
873
- fwrite(utf8_codepoint, length, 1, con_st.out);
874
- return expectedWidth;
875
- }
876
-
877
- fputs("\033[6n", con_st.tty); // Query cursor position
878
- int x1, x2, y1, y2;
879
- int results = 0;
880
- results = fscanf(con_st.tty, "\033[%d;%dR", &y1, &x1);
881
-
882
- fwrite(utf8_codepoint, length, 1, con_st.tty);
883
-
884
- fputs("\033[6n", con_st.tty); // Query cursor position
885
- results += fscanf(con_st.tty, "\033[%d;%dR", &y2, &x2);
886
-
887
- if (results != 4) {
888
- return expectedWidth;
889
- }
890
-
891
- int width = x2 - x1;
892
- if (width < 0) {
893
- // Calculate the width considering text wrapping
894
- struct winsize w;
895
- ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
896
- width += w.ws_col;
897
- }
898
- return width;
899
- #endif
900
- }
901
-
902
- void replace_last(console_state & con_st, char ch) {
903
- #if defined(_WIN32)
904
- pop_cursor(con_st);
905
- put_codepoint(con_st, &ch, 1, 1);
906
- #else
907
- fprintf(con_st.out, "\b%c", ch);
908
- #endif
909
- }
910
-
911
- void append_utf8(char32_t ch, std::string & out) {
912
- if (ch <= 0x7F) {
913
- out.push_back(static_cast<unsigned char>(ch));
914
- } else if (ch <= 0x7FF) {
915
- out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
916
- out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
917
- } else if (ch <= 0xFFFF) {
918
- out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
919
- out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
920
- out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
921
- } else if (ch <= 0x10FFFF) {
922
- out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
923
- out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
924
- out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
925
- out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
926
- } else {
927
- // Invalid Unicode code point
928
- }
929
- }
930
-
931
- // Helper function to remove the last UTF-8 character from a string
932
- void pop_back_utf8_char(std::string & line) {
933
- if (line.empty()) {
934
- return;
935
- }
936
-
937
- size_t pos = line.length() - 1;
938
-
939
- // Find the start of the last UTF-8 character (checking up to 4 bytes back)
940
- for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
941
- if ((line[pos] & 0xC0) != 0x80) break; // Found the start of the character
942
- }
943
- line.erase(pos);
944
- }
945
-
946
- bool console_readline(console_state & con_st, std::string & line) {
947
- console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
948
- if (con_st.out != stdout) {
949
- fflush(stdout);
950
- }
951
-
952
- line.clear();
953
- std::vector<int> widths;
954
- bool is_special_char = false;
955
- bool end_of_stream = false;
956
-
957
- char32_t input_char;
958
- while (true) {
959
- fflush(con_st.out); // Ensure all output is displayed before waiting for input
960
- input_char = getchar32();
961
-
962
- if (input_char == '\r' || input_char == '\n') {
963
- break;
964
- }
965
-
966
- if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
967
- end_of_stream = true;
968
- break;
969
- }
970
-
971
- if (is_special_char) {
972
- console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
973
- replace_last(con_st, line.back());
974
- is_special_char = false;
975
- }
976
-
977
- if (input_char == '\033') { // Escape sequence
978
- char32_t code = getchar32();
979
- if (code == '[' || code == 0x1B) {
980
- // Discard the rest of the escape sequence
981
- while ((code = getchar32()) != (char32_t) WEOF) {
982
- if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
983
- break;
984
- }
985
- }
986
- }
987
- } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
988
- if (!widths.empty()) {
989
- int count;
990
- do {
991
- count = widths.back();
992
- widths.pop_back();
993
- // Move cursor back, print space, and move cursor back again
994
- for (int i = 0; i < count; i++) {
995
- replace_last(con_st, ' ');
996
- pop_cursor(con_st);
997
- }
998
- pop_back_utf8_char(line);
999
- } while (count == 0 && !widths.empty());
1000
- }
1001
- } else {
1002
- int offset = line.length();
1003
- append_utf8(input_char, line);
1004
- int width = put_codepoint(con_st, line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
1005
- if (width < 0) {
1006
- width = 0;
1007
- }
1008
- widths.push_back(width);
1009
- }
1010
-
1011
- if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
1012
- console_set_color(con_st, CONSOLE_COLOR_PROMPT);
1013
- replace_last(con_st, line.back());
1014
- is_special_char = true;
1015
- }
1016
- }
1017
-
1018
- bool has_more = con_st.multiline_input;
1019
- if (is_special_char) {
1020
- replace_last(con_st, ' ');
1021
- pop_cursor(con_st);
1022
-
1023
- char last = line.back();
1024
- line.pop_back();
1025
- if (last == '\\') {
1026
- line += '\n';
1027
- fputc('\n', con_st.out);
1028
- has_more = !has_more;
1029
- } else {
1030
- // llama will just eat the single space, it won't act as a space
1031
- if (line.length() == 1 && line.back() == ' ') {
1032
- line.clear();
1033
- pop_cursor(con_st);
1034
- }
1035
- has_more = false;
1036
- }
1037
- } else {
1038
- if (end_of_stream) {
1039
- has_more = false;
1040
- } else {
1041
- line += '\n';
1042
- fputc('\n', con_st.out);
1043
- }
1044
- }
1045
-
1046
- fflush(con_st.out);
1047
- return has_more;
1048
- }
 
25
  #else
26
  #include <sys/ioctl.h>
27
  #include <unistd.h>
 
28
  #endif
29
 
30
  #if defined(_MSC_VER)
 
194
  break;
195
  }
196
  params.rope_freq_scale = std::stof(argv[i]);
197
+ } else if (arg == "--rope-scale") {
198
+ if (++i >= argc) {
199
+ invalid_param = true;
200
+ break;
201
+ }
202
+ params.rope_freq_scale = 1.0f/std::stof(argv[i]);
203
  } else if (arg == "--memory-f32") {
204
  params.memory_f16 = false;
205
  } else if (arg == "--top-p") {
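Editor's note: as the hunk above shows, `--rope-scale` is just a convenience spelling of `--rope-freq-scale`; the parser stores its reciprocal in the same `rope_freq_scale` field. A minimal, self-contained sketch of that relationship (illustrative only, not part of the commit):

```cpp
// Hypothetical illustration: what the argument parser above effectively does.
#include <cstdio>

int main() {
    float rope_scale      = 8.0f;               // user passes --rope-scale 8
    float rope_freq_scale = 1.0f / rope_scale;  // stored value: 0.125
    printf("--rope-scale %.0f  ==  --rope-freq-scale %g\n", rope_scale, rope_freq_scale);
    return 0;
}
```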
 
334
  params.instruct = true;
335
  } else if (arg == "--multiline-input") {
336
  params.multiline_input = true;
337
+ } else if (arg == "--simple-io") {
338
+ params.simple_io = true;
339
  } else if (arg == "--color") {
340
  params.use_color = true;
341
  } else if (arg == "--mlock") {
 
359
  #ifdef GGML_USE_CUBLAS
360
  params.main_gpu = std::stoi(argv[i]);
361
  #else
362
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
363
  #endif
364
  } else if (arg == "--tensor-split" || arg == "-ts") {
365
  if (++i >= argc) {
 
383
  }
384
  }
385
  #else
386
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
387
+ #endif // GGML_USE_CUBLAS
388
+ } else if (arg == "--mul-mat-q" || arg == "-mmq") {
389
+ #ifdef GGML_USE_CUBLAS
390
+ params.mul_mat_q = true;
391
+ #else
392
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
393
  #endif // GGML_USE_CUBLAS
394
  } else if (arg == "--low-vram" || arg == "-lv") {
395
  #ifdef GGML_USE_CUBLAS
396
  params.low_vram = true;
397
  #else
398
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
399
  #endif // GGML_USE_CUBLAS
400
  } else if (arg == "--no-mmap") {
401
  params.use_mmap = false;
 
415
  params.antiprompt.push_back(argv[i]);
416
  } else if (arg == "--perplexity") {
417
  params.perplexity = true;
418
+ } else if (arg == "--hellaswag") {
419
+ params.hellaswag = true;
420
+ } else if (arg == "--hellaswag-tasks") {
421
+ if (++i >= argc) {
422
+ invalid_param = true;
423
+ break;
424
+ }
425
+ params.hellaswag_tasks = std::stoi(argv[i]);
426
  } else if (arg == "--ignore-eos") {
427
  params.logit_bias[llama_token_eos()] = -INFINITY;
428
  } else if (arg == "--no-penalize-nl") {
 
570
  fprintf(stdout, " --cfg-negative-prompt PROMPT \n");
571
  fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
572
  fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
573
+ fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
574
+ fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
575
+ fprintf(stdout, " --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
576
  fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
577
  fprintf(stdout, " --no-penalize-nl do not penalize newline token\n");
578
  fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
579
  fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
580
  fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp);
581
  fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n");
582
+ fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
583
+ fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
584
+ fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
585
  fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
586
  if (llama_mlock_supported()) {
587
  fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
 
599
  fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
600
  fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
601
  fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
602
+ fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
603
+ fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
604
+ fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
605
  #endif
606
  fprintf(stdout, " --mtest compute maximum memory usage\n");
607
  fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
608
  fprintf(stdout, " --verbose-prompt print prompt before generation\n");
609
+ fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
610
  fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
611
  fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
612
  fprintf(stdout, " -m FNAME, --model FNAME\n");
 
655
  lparams.main_gpu = params.main_gpu;
656
  lparams.tensor_split = params.tensor_split;
657
  lparams.low_vram = params.low_vram;
658
+ lparams.mul_mat_q = params.mul_mat_q;
659
  lparams.seed = params.seed;
660
  lparams.f16_kv = params.memory_f16;
661
  lparams.use_mmap = params.use_mmap;
 
699
 
700
  return std::make_tuple(model, lctx);
701
  }
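Editor's aside: the helpers above funnel the parsed CLI options into `llama_context_params` before the model is loaded. A hedged sketch of how the new `mul_mat_q` flag travels through that path (field and function names are taken from the hunks above and from common.h; everything else is illustrative):

```cpp
#include "common.h"   // gpt_params, llama_context_params_from_gpt_params
#include "llama.h"

int main() {
    gpt_params params;
    params.mul_mat_q = true;    // set by --mul-mat-q / -mmq (cuBLAS builds only)
    params.low_vram  = false;   // set by -lv / --low-vram

    llama_context_params lparams = llama_context_params_from_gpt_params(params);
    // lparams.mul_mat_q and lparams.low_vram now mirror the CLI flags,
    // exactly as the assignments shown in the diff above.
    return lparams.mul_mat_q ? 0 : 1;
}
```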
examples/common.h CHANGED
@@ -11,11 +11,6 @@
11
  #include <unordered_map>
12
  #include <tuple>
13
 
14
- #if !defined (_WIN32)
15
- #include <stdio.h>
16
- #include <termios.h>
17
- #endif
18
-
19
  //
20
  // CLI argument parsing
21
  //
@@ -70,7 +65,11 @@ struct gpt_params {
70
  std::string lora_adapter = ""; // lora adapter path
71
  std::string lora_base = ""; // base model path for the lora adapter
72
 
73
- bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
 
 
 
 
74
  bool memory_f16 = true; // use f16 instead of f32 for memory kv
75
  bool random_prompt = false; // do not randomize prompt if none provided
76
  bool use_color = false; // use color to distinguish generations and inputs
@@ -81,12 +80,12 @@ struct gpt_params {
81
  bool embedding = false; // get only sentence embedding
82
  bool interactive_first = false; // wait for user input immediately
83
  bool multiline_input = false; // reverse the usage of `\`
 
84
 
85
  bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
86
  bool instruct = false; // instruction mode (used for Alpaca models)
87
  bool penalize_nl = true; // consider newlines as a repeatable token
88
  bool perplexity = false; // compute perplexity over the prompt
89
- bool perplexity_lines = false; // compute perplexity over each line of the prompt
90
  bool use_mmap = true; // use mmap for faster loads
91
  bool use_mlock = false; // use mlock to keep model in memory
92
  bool mem_test = false; // compute maximum memory usage
@@ -113,42 +112,3 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
113
 
114
  std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
115
  struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
116
-
117
- //
118
- // Console utils
119
- //
120
-
121
- #define ANSI_COLOR_RED "\x1b[31m"
122
- #define ANSI_COLOR_GREEN "\x1b[32m"
123
- #define ANSI_COLOR_YELLOW "\x1b[33m"
124
- #define ANSI_COLOR_BLUE "\x1b[34m"
125
- #define ANSI_COLOR_MAGENTA "\x1b[35m"
126
- #define ANSI_COLOR_CYAN "\x1b[36m"
127
- #define ANSI_COLOR_RESET "\x1b[0m"
128
- #define ANSI_BOLD "\x1b[1m"
129
-
130
- enum console_color_t {
131
- CONSOLE_COLOR_DEFAULT=0,
132
- CONSOLE_COLOR_PROMPT,
133
- CONSOLE_COLOR_USER_INPUT,
134
- CONSOLE_COLOR_ERROR
135
- };
136
-
137
- struct console_state {
138
- bool multiline_input = false;
139
- bool use_color = false;
140
- console_color_t color = CONSOLE_COLOR_DEFAULT;
141
-
142
- FILE* out = stdout;
143
- #if defined (_WIN32)
144
- void* hConsole;
145
- #else
146
- FILE* tty = nullptr;
147
- termios prev_state;
148
- #endif
149
- };
150
-
151
- void console_init(console_state & con_st);
152
- void console_cleanup(console_state & con_st);
153
- void console_set_color(console_state & con_st, console_color_t color);
154
- bool console_readline(console_state & con_st, std::string & line);
 
11
  #include <unordered_map>
12
  #include <tuple>
13
 
 
 
 
 
 
14
  //
15
  // CLI argument parsing
16
  //
 
65
  std::string lora_adapter = ""; // lora adapter path
66
  std::string lora_base = ""; // base model path for the lora adapter
67
 
68
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
69
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
70
+
71
+ bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
72
+ bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels
73
  bool memory_f16 = true; // use f16 instead of f32 for memory kv
74
  bool random_prompt = false; // do not randomize prompt if none provided
75
  bool use_color = false; // use color to distinguish generations and inputs
 
80
  bool embedding = false; // get only sentence embedding
81
  bool interactive_first = false; // wait for user input immediately
82
  bool multiline_input = false; // reverse the usage of `\`
83
+ bool simple_io = false; // improves compatibility with subprocesses and limited consoles
84
 
85
  bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
86
  bool instruct = false; // instruction mode (used for Alpaca models)
87
  bool penalize_nl = true; // consider newlines as a repeatable token
88
  bool perplexity = false; // compute perplexity over the prompt
 
89
  bool use_mmap = true; // use mmap for faster loads
90
  bool use_mlock = false; // use mlock to keep model in memory
91
  bool mem_test = false; // compute maximum memory usage
 
112
 
113
  std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
114
  struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
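Editor's aside: the new `gpt_params` members above are plain data that downstream tools read. A short sketch of setting them programmatically (hedged: the perplexity example is assumed to consult `hellaswag`/`hellaswag_tasks`, as the CLI changes in common.cpp suggest):

```cpp
#include "common.h"

int main() {
    gpt_params params;
    params.hellaswag       = true;   // --hellaswag
    params.hellaswag_tasks = 400;    // --hellaswag-tasks N (400 is the default above)
    params.simple_io       = true;   // --simple-io: skip raw-mode console handling
    params.mul_mat_q       = false;  // -mmq left off
    // Model loading and evaluation are unchanged and omitted here.
    return params.hellaswag ? 0 : 1;
}
```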
examples/console.cpp ADDED
@@ -0,0 +1,496 @@
1
+ #include "console.h"
2
+ #include <vector>
3
+ #include <iostream>
4
+
5
+ #if defined(_WIN32)
6
+ #define WIN32_LEAN_AND_MEAN
7
+ #ifndef NOMINMAX
8
+ #define NOMINMAX
9
+ #endif
10
+ #include <windows.h>
11
+ #include <fcntl.h>
12
+ #include <io.h>
13
+ #else
14
+ #include <climits>
15
+ #include <sys/ioctl.h>
16
+ #include <unistd.h>
17
+ #include <wchar.h>
18
+ #include <stdio.h>
19
+ #include <stdlib.h>
20
+ #include <signal.h>
21
+ #include <termios.h>
22
+ #endif
23
+
24
+ #define ANSI_COLOR_RED "\x1b[31m"
25
+ #define ANSI_COLOR_GREEN "\x1b[32m"
26
+ #define ANSI_COLOR_YELLOW "\x1b[33m"
27
+ #define ANSI_COLOR_BLUE "\x1b[34m"
28
+ #define ANSI_COLOR_MAGENTA "\x1b[35m"
29
+ #define ANSI_COLOR_CYAN "\x1b[36m"
30
+ #define ANSI_COLOR_RESET "\x1b[0m"
31
+ #define ANSI_BOLD "\x1b[1m"
32
+
33
+ namespace console {
34
+
35
+ //
36
+ // Console state
37
+ //
38
+
39
+ static bool advanced_display = false;
40
+ static bool simple_io = true;
41
+ static display_t current_display = reset;
42
+
43
+ static FILE* out = stdout;
44
+
45
+ #if defined (_WIN32)
46
+ static void* hConsole;
47
+ #else
48
+ static FILE* tty = nullptr;
49
+ static termios initial_state;
50
+ #endif
51
+
52
+ //
53
+ // Init and cleanup
54
+ //
55
+
56
+ void init(bool use_simple_io, bool use_advanced_display) {
57
+ advanced_display = use_advanced_display;
58
+ simple_io = use_simple_io;
59
+ #if defined(_WIN32)
60
+ // Windows-specific console initialization
61
+ DWORD dwMode = 0;
62
+ hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
63
+ if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
64
+ hConsole = GetStdHandle(STD_ERROR_HANDLE);
65
+ if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) {
66
+ hConsole = nullptr;
67
+ simple_io = true;
68
+ }
69
+ }
70
+ if (hConsole) {
71
+ // Enable ANSI colors on Windows 10+
72
+ if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
73
+ SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
74
+ }
75
+ // Set console output codepage to UTF8
76
+ SetConsoleOutputCP(CP_UTF8);
77
+ }
78
+ HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
79
+ if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
80
+ // Set console input codepage to UTF16
81
+ _setmode(_fileno(stdin), _O_WTEXT);
82
+
83
+ // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
84
+ if (simple_io) {
85
+ dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
86
+ } else {
87
+ dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
88
+ }
89
+ if (!SetConsoleMode(hConIn, dwMode)) {
90
+ simple_io = true;
91
+ }
92
+ }
93
+ #else
94
+ // POSIX-specific console initialization
95
+ if (!simple_io) {
96
+ struct termios new_termios;
97
+ tcgetattr(STDIN_FILENO, &initial_state);
98
+ new_termios = initial_state;
99
+ new_termios.c_lflag &= ~(ICANON | ECHO);
100
+ new_termios.c_cc[VMIN] = 1;
101
+ new_termios.c_cc[VTIME] = 0;
102
+ tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
103
+
104
+ tty = fopen("/dev/tty", "w+");
105
+ if (tty != nullptr) {
106
+ out = tty;
107
+ }
108
+ }
109
+
110
+ setlocale(LC_ALL, "");
111
+ #endif
112
+ }
113
+
114
+ void cleanup() {
115
+ // Reset console display
116
+ set_display(reset);
117
+
118
+ #if !defined(_WIN32)
119
+ // Restore settings on POSIX systems
120
+ if (!simple_io) {
121
+ if (tty != nullptr) {
122
+ out = stdout;
123
+ fclose(tty);
124
+ tty = nullptr;
125
+ }
126
+ tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
127
+ }
128
+ #endif
129
+ }
130
+
131
+ //
132
+ // Display and IO
133
+ //
134
+
135
+ // Keep track of current display and only emit ANSI code if it changes
136
+ void set_display(display_t display) {
137
+ if (advanced_display && current_display != display) {
138
+ fflush(stdout);
139
+ switch(display) {
140
+ case reset:
141
+ fprintf(out, ANSI_COLOR_RESET);
142
+ break;
143
+ case prompt:
144
+ fprintf(out, ANSI_COLOR_YELLOW);
145
+ break;
146
+ case user_input:
147
+ fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
148
+ break;
149
+ case error:
150
+ fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
151
+ }
152
+ current_display = display;
153
+ fflush(out);
154
+ }
155
+ }
156
+
157
+ char32_t getchar32() {
158
+ #if defined(_WIN32)
159
+ HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
160
+ wchar_t high_surrogate = 0;
161
+
162
+ while (true) {
163
+ INPUT_RECORD record;
164
+ DWORD count;
165
+ if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
166
+ return WEOF;
167
+ }
168
+
169
+ if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
170
+ wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
171
+ if (wc == 0) {
172
+ continue;
173
+ }
174
+
175
+ if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
176
+ high_surrogate = wc;
177
+ continue;
178
+ }
179
+ if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
180
+ if (high_surrogate != 0) { // Check if we have a high surrogate
181
+ return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
182
+ }
183
+ }
184
+
185
+ high_surrogate = 0; // Reset the high surrogate
186
+ return static_cast<char32_t>(wc);
187
+ }
188
+ }
189
+ #else
190
+ wchar_t wc = getwchar();
191
+ if (static_cast<wint_t>(wc) == WEOF) {
192
+ return WEOF;
193
+ }
194
+
195
+ #if WCHAR_MAX == 0xFFFF
196
+ if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
197
+ wchar_t low_surrogate = getwchar();
198
+ if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
199
+ return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
200
+ }
201
+ }
202
+ if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
203
+ return 0xFFFD; // Return the replacement character U+FFFD
204
+ }
205
+ #endif
206
+
207
+ return static_cast<char32_t>(wc);
208
+ #endif
209
+ }
210
+
211
+ void pop_cursor() {
212
+ #if defined(_WIN32)
213
+ if (hConsole != NULL) {
214
+ CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
215
+ GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
216
+
217
+ COORD newCursorPosition = bufferInfo.dwCursorPosition;
218
+ if (newCursorPosition.X == 0) {
219
+ newCursorPosition.X = bufferInfo.dwSize.X - 1;
220
+ newCursorPosition.Y -= 1;
221
+ } else {
222
+ newCursorPosition.X -= 1;
223
+ }
224
+
225
+ SetConsoleCursorPosition(hConsole, newCursorPosition);
226
+ return;
227
+ }
228
+ #endif
229
+ putc('\b', out);
230
+ }
231
+
232
+ int estimateWidth(char32_t codepoint) {
233
+ #if defined(_WIN32)
234
+ return 1;
235
+ #else
236
+ return wcwidth(codepoint);
237
+ #endif
238
+ }
239
+
240
+ int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
241
+ #if defined(_WIN32)
242
+ CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
243
+ if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
244
+ // go with the default
245
+ return expectedWidth;
246
+ }
247
+ COORD initialPosition = bufferInfo.dwCursorPosition;
248
+ DWORD nNumberOfChars = length;
249
+ WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
250
+
251
+ CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
252
+ GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
253
+
254
+ // Figure out our real position if we're in the last column
255
+ if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
256
+ DWORD nNumberOfChars;
257
+ WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL);
258
+ GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
259
+ }
260
+
261
+ int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
262
+ if (width < 0) {
263
+ width += newBufferInfo.dwSize.X;
264
+ }
265
+ return width;
266
+ #else
267
+ // We can trust expectedWidth if we've got one
268
+ if (expectedWidth >= 0 || tty == nullptr) {
269
+ fwrite(utf8_codepoint, length, 1, out);
270
+ return expectedWidth;
271
+ }
272
+
273
+ fputs("\033[6n", tty); // Query cursor position
274
+ int x1;
275
+ int y1;
276
+ int x2;
277
+ int y2;
278
+ int results = 0;
279
+ results = fscanf(tty, "\033[%d;%dR", &y1, &x1);
280
+
281
+ fwrite(utf8_codepoint, length, 1, tty);
282
+
283
+ fputs("\033[6n", tty); // Query cursor position
284
+ results += fscanf(tty, "\033[%d;%dR", &y2, &x2);
285
+
286
+ if (results != 4) {
287
+ return expectedWidth;
288
+ }
289
+
290
+ int width = x2 - x1;
291
+ if (width < 0) {
292
+ // Calculate the width considering text wrapping
293
+ struct winsize w;
294
+ ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
295
+ width += w.ws_col;
296
+ }
297
+ return width;
298
+ #endif
299
+ }
300
+
301
+ void replace_last(char ch) {
302
+ #if defined(_WIN32)
303
+ pop_cursor();
304
+ put_codepoint(&ch, 1, 1);
305
+ #else
306
+ fprintf(out, "\b%c", ch);
307
+ #endif
308
+ }
309
+
310
+ void append_utf8(char32_t ch, std::string & out) {
311
+ if (ch <= 0x7F) {
312
+ out.push_back(static_cast<unsigned char>(ch));
313
+ } else if (ch <= 0x7FF) {
314
+ out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
315
+ out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
316
+ } else if (ch <= 0xFFFF) {
317
+ out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
318
+ out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
319
+ out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
320
+ } else if (ch <= 0x10FFFF) {
321
+ out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
322
+ out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
323
+ out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
324
+ out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
325
+ } else {
326
+ // Invalid Unicode code point
327
+ }
328
+ }
329
+
330
+ // Helper function to remove the last UTF-8 character from a string
331
+ void pop_back_utf8_char(std::string & line) {
332
+ if (line.empty()) {
333
+ return;
334
+ }
335
+
336
+ size_t pos = line.length() - 1;
337
+
338
+ // Find the start of the last UTF-8 character (checking up to 4 bytes back)
339
+ for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
340
+ if ((line[pos] & 0xC0) != 0x80) {
341
+ break; // Found the start of the character
342
+ }
343
+ }
344
+ line.erase(pos);
345
+ }
346
+
347
+ bool readline_advanced(std::string & line, bool multiline_input) {
348
+ if (out != stdout) {
349
+ fflush(stdout);
350
+ }
351
+
352
+ line.clear();
353
+ std::vector<int> widths;
354
+ bool is_special_char = false;
355
+ bool end_of_stream = false;
356
+
357
+ char32_t input_char;
358
+ while (true) {
359
+ fflush(out); // Ensure all output is displayed before waiting for input
360
+ input_char = getchar32();
361
+
362
+ if (input_char == '\r' || input_char == '\n') {
363
+ break;
364
+ }
365
+
366
+ if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
367
+ end_of_stream = true;
368
+ break;
369
+ }
370
+
371
+ if (is_special_char) {
372
+ set_display(user_input);
373
+ replace_last(line.back());
374
+ is_special_char = false;
375
+ }
376
+
377
+ if (input_char == '\033') { // Escape sequence
378
+ char32_t code = getchar32();
379
+ if (code == '[' || code == 0x1B) {
380
+ // Discard the rest of the escape sequence
381
+ while ((code = getchar32()) != (char32_t) WEOF) {
382
+ if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
383
+ break;
384
+ }
385
+ }
386
+ }
387
+ } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
388
+ if (!widths.empty()) {
389
+ int count;
390
+ do {
391
+ count = widths.back();
392
+ widths.pop_back();
393
+ // Move cursor back, print space, and move cursor back again
394
+ for (int i = 0; i < count; i++) {
395
+ replace_last(' ');
396
+ pop_cursor();
397
+ }
398
+ pop_back_utf8_char(line);
399
+ } while (count == 0 && !widths.empty());
400
+ }
401
+ } else {
402
+ int offset = line.length();
403
+ append_utf8(input_char, line);
404
+ int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
405
+ if (width < 0) {
406
+ width = 0;
407
+ }
408
+ widths.push_back(width);
409
+ }
410
+
411
+ if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
412
+ set_display(prompt);
413
+ replace_last(line.back());
414
+ is_special_char = true;
415
+ }
416
+ }
417
+
418
+ bool has_more = multiline_input;
419
+ if (is_special_char) {
420
+ replace_last(' ');
421
+ pop_cursor();
422
+
423
+ char last = line.back();
424
+ line.pop_back();
425
+ if (last == '\\') {
426
+ line += '\n';
427
+ fputc('\n', out);
428
+ has_more = !has_more;
429
+ } else {
430
+ // llama will just eat the single space, it won't act as a space
431
+ if (line.length() == 1 && line.back() == ' ') {
432
+ line.clear();
433
+ pop_cursor();
434
+ }
435
+ has_more = false;
436
+ }
437
+ } else {
438
+ if (end_of_stream) {
439
+ has_more = false;
440
+ } else {
441
+ line += '\n';
442
+ fputc('\n', out);
443
+ }
444
+ }
445
+
446
+ fflush(out);
447
+ return has_more;
448
+ }
449
+
450
+ bool readline_simple(std::string & line, bool multiline_input) {
451
+ #if defined(_WIN32)
452
+ std::wstring wline;
453
+ if (!std::getline(std::wcin, wline)) {
454
+ // Input stream is bad or EOF received
455
+ line.clear();
456
+ GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0);
457
+ return false;
458
+ }
459
+
460
+ int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL);
461
+ line.resize(size_needed);
462
+ WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL);
463
+ #else
464
+ if (!std::getline(std::cin, line)) {
465
+ // Input stream is bad or EOF received
466
+ line.clear();
467
+ return false;
468
+ }
469
+ #endif
470
+ if (!line.empty()) {
471
+ char last = line.back();
472
+ if (last == '/') { // Always return control on '/' symbol
473
+ line.pop_back();
474
+ return false;
475
+ }
476
+ if (last == '\\') { // '\\' changes the default action
477
+ line.pop_back();
478
+ multiline_input = !multiline_input;
479
+ }
480
+ }
481
+ line += '\n';
482
+
483
+ // By default, continue input if multiline_input is set
484
+ return multiline_input;
485
+ }
486
+
487
+ bool readline(std::string & line, bool multiline_input) {
488
+ set_display(user_input);
489
+
490
+ if (simple_io) {
491
+ return readline_simple(line, multiline_input);
492
+ }
493
+ return readline_advanced(line, multiline_input);
494
+ }
495
+
496
+ }
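Editor's aside: taken together, the new `console` namespace replaces the per-call `console_state` plumbing that was removed from common.cpp. A minimal usage sketch of the API declared in examples/console.h (illustrative only):

```cpp
#include "console.h"
#include <cstdio>
#include <string>

int main() {
    // init(use_simple_io, use_advanced_display)
    console::init(/*use_simple_io=*/false, /*use_advanced_display=*/true);

    console::set_display(console::prompt);
    printf("Ask something:\n");

    std::string line;
    bool another_line = false;
    do {
        console::set_display(console::user_input);
        another_line = console::readline(line, /*multiline_input=*/false);
        // ... hand `line` to the model here ...
    } while (another_line);

    console::set_display(console::reset);
    console::cleanup();
    return 0;
}
```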
examples/console.h ADDED
@@ -0,0 +1,19 @@
1
+ // Console functions
2
+
3
+ #pragma once
4
+
5
+ #include <string>
6
+
7
+ namespace console {
8
+ enum display_t {
9
+ reset = 0,
10
+ prompt,
11
+ user_input,
12
+ error
13
+ };
14
+
15
+ void init(bool use_simple_io, bool use_advanced_display);
16
+ void cleanup();
17
+ void set_display(display_t display);
18
+ bool readline(std::string & line, bool multiline_input);
19
+ }
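Editor's aside on the implementation above: on Windows, `getchar32()` reassembles UTF-16 surrogate pairs into a code point, and `append_utf8()` re-encodes that code point as UTF-8 bytes. A tiny self-contained check of the arithmetic (it mirrors the formulas in examples/console.cpp but is not taken from the commit):

```cpp
#include <cassert>
#include <string>

int main() {
    // U+1F600 arrives from the Windows console as the surrogate pair D83D / DE00.
    unsigned hi = 0xD83D, lo = 0xDE00;
    char32_t cp = ((hi - 0xD800) << 10) + (lo - 0xDC00) + 0x10000;
    assert(cp == 0x1F600);

    // append_utf8-style encoding of the same code point: four bytes F0 9F 98 80.
    std::string out;
    out.push_back(char(0xF0 | ((cp >> 18) & 0x07)));
    out.push_back(char(0x80 | ((cp >> 12) & 0x3F)));
    out.push_back(char(0x80 | ((cp >>  6) & 0x3F)));
    out.push_back(char(0x80 |  (cp        & 0x3F)));
    assert(out == "\xF0\x9F\x98\x80");
    return 0;
}
```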
examples/embd-input/embd-input-lib.cpp CHANGED
@@ -30,7 +30,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
30
  fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
31
 
32
  if (params.seed == LLAMA_DEFAULT_SEED) {
33
- params.seed = time(NULL);
34
  }
35
  fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
36
 
 
30
  fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
31
 
32
  if (params.seed == LLAMA_DEFAULT_SEED) {
33
+ params.seed = uint32_t(time(NULL));
34
  }
35
  fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
36
 
examples/grammar-parser.cpp CHANGED
@@ -405,7 +405,7 @@ namespace grammar_parser {
405
  for (size_t i = 0, end = state.rules.size(); i < end; i++) {
406
  // fprintf(file, "%zu: ", i);
407
  // print_rule_binary(file, state.rules[i]);
408
- print_rule(file, i, state.rules[i], symbol_id_names);
409
  // fprintf(file, "\n");
410
  }
411
  } catch (const std::exception & err) {
 
405
  for (size_t i = 0, end = state.rules.size(); i < end; i++) {
406
  // fprintf(file, "%zu: ", i);
407
  // print_rule_binary(file, state.rules[i]);
408
+ print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
409
  // fprintf(file, "\n");
410
  }
411
  } catch (const std::exception & err) {
examples/json-schema-to-grammar.py ADDED
@@ -0,0 +1,132 @@
1
+ import argparse
2
+ import json
3
+ import re
4
+ import sys
5
+
6
+ # whitespace is constrained to a single space char to prevent model "running away" in
7
+ # whitespace. Also maybe improves generation quality?
8
+ SPACE_RULE = '" "?'
9
+
10
+ PRIMITIVE_RULES = {
11
+ 'boolean': '("true" | "false") space',
12
+ 'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
13
+ 'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
14
+ 'string': r''' "\"" (
15
+ [^"\\] |
16
+ "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
17
+ )* "\"" space ''',
18
+ 'null': '"null" space',
19
+ }
20
+
21
+ INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
22
+ GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
23
+ GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}
24
+
25
+
26
+ class SchemaConverter:
27
+ def __init__(self, prop_order):
28
+ self._prop_order = prop_order
29
+ self._rules = {'space': SPACE_RULE}
30
+
31
+ def _format_literal(self, literal):
32
+ escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
33
+ lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
34
+ )
35
+ return f'"{escaped}"'
36
+
37
+ def _add_rule(self, name, rule):
38
+ esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
39
+ if esc_name not in self._rules or self._rules[esc_name] == rule:
40
+ key = esc_name
41
+ else:
42
+ i = 0
43
+ while f'{esc_name}{i}' in self._rules:
44
+ i += 1
45
+ key = f'{esc_name}{i}'
46
+ self._rules[key] = rule
47
+ return key
48
+
49
+ def visit(self, schema, name):
50
+ schema_type = schema.get('type')
51
+ rule_name = name or 'root'
52
+
53
+ if 'oneOf' in schema or 'anyOf' in schema:
54
+ rule = ' | '.join((
55
+ self.visit(alt_schema, f'{name}{"-" if name else ""}{i}')
56
+ for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf'])
57
+ ))
58
+ return self._add_rule(rule_name, rule)
59
+
60
+ elif 'const' in schema:
61
+ return self._add_rule(rule_name, self._format_literal(schema['const']))
62
+
63
+ elif 'enum' in schema:
64
+ rule = ' | '.join((self._format_literal(v) for v in schema['enum']))
65
+ return self._add_rule(rule_name, rule)
66
+
67
+ elif schema_type == 'object' and 'properties' in schema:
68
+ # TODO: `required` keyword
69
+ prop_order = self._prop_order
70
+ prop_pairs = sorted(
71
+ schema['properties'].items(),
72
+ # sort by position in prop_order (if specified) then by key
73
+ key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
74
+ )
75
+
76
+ rule = '"{" space'
77
+ for i, (prop_name, prop_schema) in enumerate(prop_pairs):
78
+ prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
79
+ if i > 0:
80
+ rule += ' "," space'
81
+ rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
82
+ rule += ' "}" space'
83
+
84
+ return self._add_rule(rule_name, rule)
85
+
86
+ elif schema_type == 'array' and 'items' in schema:
87
+ # TODO `prefixItems` keyword
88
+ item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
89
+ rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
90
+ return self._add_rule(rule_name, rule)
91
+
92
+ else:
93
+ assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
94
+ return self._add_rule(
95
+ 'root' if rule_name == 'root' else schema_type,
96
+ PRIMITIVE_RULES[schema_type]
97
+ )
98
+
99
+ def format_grammar(self):
100
+ return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items()))
101
+
102
+
103
+ def main(args_in = None):
104
+ parser = argparse.ArgumentParser(
105
+ description='''
106
+ Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
107
+ given JSON schema. Only a subset of JSON schema features are supported; more may be
108
+ added in the future.
109
+ ''',
110
+ )
111
+ parser.add_argument(
112
+ '--prop-order',
113
+ default=[],
114
+ type=lambda s: s.split(','),
115
+ help='''
116
+ comma-separated property names defining the order of precedence for object properties;
117
+ properties not specified here are given lower precedence than those that are, and are
118
+ sorted alphabetically
119
+ '''
120
+ )
121
+ parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
122
+ args = parser.parse_args(args_in)
123
+
124
+ schema = json.load(sys.stdin if args.schema == '-' else open(args.schema))
125
+ prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
126
+ converter = SchemaConverter(prop_order)
127
+ converter.visit(schema, '')
128
+ print(converter.format_grammar())
129
+
130
+
131
+ if __name__ == '__main__':
132
+ main()
examples/llama.vim ADDED
@@ -0,0 +1,132 @@
1
+ " Requires an already running llama.cpp server
2
+ " To install either copy or symlink to ~/.vim/autoload/llama.vim
3
+ " Then start with either :call llama#doLlamaGen(),
4
+ " or add a keybind to your vimrc such as
5
+ " nnoremap Z :call llama#doLlamaGen()<CR>
6
+ " Similarly, you could add an insert mode keybind with
7
+ " inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
8
+ "
9
+ " g:llama_api_url and g:llama_overrides can be configured in your .vimrc
10
+ " let g:llama_api_url = "192.168.1.10:8080"
11
+ " llama_overrides can also be set through buffer/window scopes. For instance
12
+ " autocmd filetype python let b:llama_overrides = {"temp": 0.2}
13
+ " Could be added to your .vimrc to automatically set a lower temperature when
14
+ " editing a python script
15
+ " Additionally, an override dict can be stored at the top of a file
16
+ " !*{"stop": ["User:"]}
17
+ " Could be added to the start of your chatlog.txt to set the stopping token
18
+ " These parameter dicts are merged together from lowest to highest priority:
19
+ " server default -> g:llama_overrides -> w:llama_overrides ->
20
+ " b:llama_overrides -> in file (!*) overrides
21
+ "
22
+ " Sublists (like logit_bias and stop) are overridden, not merged
23
+ " Example override:
24
+ " !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
25
+ if !exists("g:llama_api_url")
26
+ let g:llama_api_url= "127.0.0.1:8080"
27
+ endif
28
+ if !exists("g:llama_overrides")
29
+ let g:llama_overrides = {}
30
+ endif
31
+ const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
32
+ const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
33
+ let s:linedict = {}
34
+
35
+ func s:callbackHandler(bufn, channel, msg)
36
+ if len(a:msg) < 3
37
+ return
38
+ elseif a:msg[0] == "d"
39
+ let l:msg = a:msg[6:-1]
40
+ else
41
+ let l:msg = a:msg
42
+ endif
43
+ let l:decoded_msg = json_decode(l:msg)
44
+ let l:newtext = split(l:decoded_msg['content'], "\n", 1)
45
+ if len(l:newtext) > 0
46
+ call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
47
+ else
48
+ echo "nothing genned"
49
+ endif
50
+ if len(newtext) > 1
51
+ let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
52
+ let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
53
+ endif
54
+ if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
55
+ echo "Finished generation"
56
+ endif
57
+ endfunction
58
+
59
+ func llama#doLlamaGen()
60
+ if exists("b:job")
61
+ if job_status(b:job) == "run"
62
+ call job_stop(b:job)
63
+ return
64
+ endif
65
+ endif
66
+
67
+ let l:cbuffer = bufnr("%")
68
+ let s:linedict[l:cbuffer] = line('$')
69
+ let l:buflines = getbufline(l:cbuffer, 1, 1000)
70
+ let l:querydata = copy(s:querydata)
71
+ call extend(l:querydata, g:llama_overrides)
72
+ if exists("w:llama_overrides")
73
+ call extend(l:querydata, w:llama_overrides)
74
+ endif
75
+ if exists("b:llama_overrides")
76
+ call extend(l:querydata, b:llama_overrides)
77
+ endif
78
+ if l:buflines[0][0:1] == '!*'
79
+ let l:userdata = json_decode(l:buflines[0][2:-1])
80
+ call extend(l:querydata, l:userdata)
81
+ let l:buflines = l:buflines[1:-1]
82
+ endif
83
+ let l:querydata.prompt = join(l:buflines, "\n")
84
+ let l:curlcommand = copy(s:curlcommand)
85
+ let l:curlcommand[2] = json_encode(l:querydata)
86
+ let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
87
+ endfunction
88
+
89
+ " Echoes the tokenization of the provided string, or from the cursor to the end of the word
90
+ " Onus is placed on the user to include the preceding space
91
+ func llama#tokenizeWord(...)
92
+ if (a:0 > 0)
93
+ let l:input = a:1
94
+ else
95
+ exe "normal \"*ye"
96
+ let l:input = @*
97
+ endif
98
+ let l:querydata = {"content": l:input}
99
+ let l:curlcommand = copy(s:curlcommand)
100
+ let l:curlcommand[2] = json_encode(l:querydata)
101
+ let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
102
+ let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
103
+ endfunction
104
+
105
+ func s:tokenizeWordCallback(plaintext, channel, msg)
106
+ echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
107
+ endfunction
108
+
109
+
110
+ " Echoes the token count of the entire buffer (or provided string)
111
+ " Example usage :echo llama#tokenCount()
112
+ func llama#tokenCount(...)
113
+ if (a:0 > 0)
114
+ let l:buflines = a:1
115
+ else
116
+ let l:buflines = getline(1,1000)
117
+ if l:buflines[0][0:1] == '!*'
118
+ let l:buflines = l:buflines[1:-1]
119
+ endif
120
+ let l:buflines = join(l:buflines, "\n")
121
+ endif
122
+ let l:querydata = {"content": l:buflines}
123
+ let l:curlcommand = copy(s:curlcommand)
124
+ let l:curlcommand[2] = json_encode(l:querydata)
125
+ let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
126
+ let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
127
+ endfunction
128
+
129
+ func s:tokenCountCallback(channel, msg)
130
+ let resp = json_decode(a:msg)
131
+ echo len(resp.tokens)
132
+ endfunction
examples/llm.vim CHANGED
@@ -1,3 +1,5 @@
 
 
1
  function! Llm()
2
 
3
  let url = "http://127.0.0.1:8080/completion"
@@ -16,8 +18,10 @@ function! Llm()
16
  " Extract the content field from the response
17
  let content = json_decode(response).content
18
 
 
 
19
  " Insert the content at the cursor position
20
- call setline(line('.'), getline('.') . content)
21
  endfunction
22
 
23
  command! Llm call Llm()
 
1
+ " Basic plugin example
2
+
3
  function! Llm()
4
 
5
  let url = "http://127.0.0.1:8080/completion"
 
18
  " Extract the content field from the response
19
  let content = json_decode(response).content
20
 
21
+ let split_newlines = split(content, '\n', 1)
22
+
23
  " Insert the content at the cursor position
24
+ call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:])
25
  endfunction
26
 
27
  command! Llm call Llm()
examples/main/README.md CHANGED
@@ -140,6 +140,12 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
140
 
141
  - `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
142
 
 
 
 
 
 
 
143
  ### Keep Prompt
144
 
145
  The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
@@ -202,9 +208,9 @@ Example usage: `--top-p 0.95`
202
 
203
  - `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
204
 
205
- Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced.
206
 
207
- Example usage: `--tfs 2.0`
208
 
209
  ### Locally Typical Sampling
210
 
 
140
 
141
  - `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
142
 
143
+ ### Extended Context Size
144
+
145
+ Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pretrained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8. It should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
146
+
147
+ - `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
148
+
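A worked restatement of the numbers above (editor's illustration):

$$
\text{rope\_scale} = \frac{n_{ctx}^{\text{fine-tuned}}}{n_{ctx}^{\text{pre-trained}}} = \frac{32768}{4096} = 8,
\qquad
\text{rope\_freq\_scale} = \frac{1}{\text{rope\_scale}} = 0.125
$$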
149
  ### Keep Prompt
150
 
151
  The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
 
208
 
209
  - `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
210
 
211
+ Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P, it tries to determine the bulk of the most likely tokens dynamically, but TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens and thus disables the effect of TFS.
212
 
213
+ Example usage: `--tfs 0.95`
214
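Editor's sketch of the procedure the paragraph above describes; llama.cpp's own implementation (`llama_sample_tail_free`) may differ in detail, so treat this as illustrative only:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    // Sorted (descending) token probabilities after softmax; values are made up.
    std::vector<double> p = {0.40, 0.25, 0.15, 0.10, 0.05, 0.03, 0.02};
    const double z = 0.95;

    // First and (absolute) second differences of the sorted probabilities.
    std::vector<double> d1(p.size() - 1), d2(p.size() - 2);
    for (size_t i = 0; i + 1 < p.size();  i++) d1[i] = p[i] - p[i + 1];
    for (size_t i = 0; i + 1 < d1.size(); i++) d2[i] = std::fabs(d1[i] - d1[i + 1]);

    // Normalize the second differences and keep tokens until their mass exceeds z.
    double sum = 0.0;
    for (double v : d2) sum += v;
    size_t keep = p.size();
    double cum  = 0.0;
    for (size_t i = 0; i < d2.size() && sum > 0.0; i++) {
        cum += d2[i] / sum;
        if (cum > z) { keep = i + 1; break; }  // cut off the flat tail
    }
    printf("keeping the %zu most likely tokens out of %zu\n", keep, p.size());
    return 0;
}
```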
 
215
  ### Locally Typical Sampling
216
 
examples/main/main.cpp CHANGED
@@ -4,6 +4,7 @@
4
  #endif
5
 
6
  #include "common.h"
 
7
  #include "llama.h"
8
  #include "build-info.h"
9
  #include "grammar-parser.h"
@@ -35,9 +36,7 @@
35
  #pragma warning(disable: 4244 4267) // possible loss of data
36
  #endif
37
 
38
- static console_state con_st;
39
  static llama_context ** g_ctx;
40
-
41
  static bool is_interacting = false;
42
 
43
  #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
@@ -46,7 +45,7 @@ void sigint_handler(int signo) {
46
  if (!is_interacting) {
47
  is_interacting=true;
48
  } else {
49
- console_cleanup(con_st);
50
  printf("\n");
51
  llama_print_timings(*g_ctx);
52
  _exit(130);
@@ -64,10 +63,8 @@ int main(int argc, char ** argv) {
64
 
65
  // save choice to use color for later
66
  // (note for later: this is a slightly awkward choice)
67
- con_st.use_color = params.use_color;
68
- con_st.multiline_input = params.multiline_input;
69
- console_init(con_st);
70
- atexit([]() { console_cleanup(con_st); });
71
 
72
  if (params.perplexity) {
73
  printf("\n************\n");
@@ -373,7 +370,7 @@ int main(int argc, char ** argv) {
373
 
374
  if (params.interactive) {
375
  const char *control_message;
376
- if (con_st.multiline_input) {
377
  control_message = " - To return control to LLaMa, end your input with '\\'.\n"
378
  " - To return control without starting a new line, end your input with '/'.\n";
379
  } else {
@@ -401,7 +398,7 @@ int main(int argc, char ** argv) {
401
  int n_past_guidance = 0;
402
 
403
  // the first thing we will do is to output the prompt, so set color accordingly
404
- console_set_color(con_st, CONSOLE_COLOR_PROMPT);
405
 
406
  std::vector<llama_token> embd;
407
  std::vector<llama_token> embd_guidance;
@@ -422,9 +419,9 @@ int main(int argc, char ** argv) {
422
  // Ensure the input doesn't exceed the context size by truncating embd if necessary.
423
  if ((int)embd.size() > max_embd_size) {
424
  auto skipped_tokens = embd.size() - max_embd_size;
425
- console_set_color(con_st, CONSOLE_COLOR_ERROR);
426
  printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
427
- console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
428
  fflush(stdout);
429
  embd.resize(max_embd_size);
430
  }
@@ -667,7 +664,7 @@ int main(int argc, char ** argv) {
667
  }
668
  // reset color to default if we there is no pending user input
669
  if (input_echo && (int)embd_inp.size() == n_consumed) {
670
- console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
671
  }
672
 
673
  // if not currently processing queued inputs;
@@ -693,7 +690,7 @@ int main(int argc, char ** argv) {
693
  if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
694
  if (params.interactive) {
695
  is_interacting = true;
696
- console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
697
  }
698
  is_antiprompt = true;
699
  fflush(stdout);
@@ -714,7 +711,7 @@ int main(int argc, char ** argv) {
714
 
715
  is_interacting = true;
716
  printf("\n");
717
- console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
718
  fflush(stdout);
719
  } else if (params.instruct) {
720
  is_interacting = true;
@@ -739,12 +736,12 @@ int main(int argc, char ** argv) {
739
  std::string line;
740
  bool another_line = true;
741
  do {
742
- another_line = console_readline(con_st, line);
743
  buffer += line;
744
  } while (another_line);
745
 
746
  // done taking input, reset color
747
- console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
748
 
749
  // Add tokens to embd only if the input buffer is non-empty
750
  // Entering a empty line lets the user pass control back
 
4
  #endif
5
 
6
  #include "common.h"
7
+ #include "console.h"
8
  #include "llama.h"
9
  #include "build-info.h"
10
  #include "grammar-parser.h"
 
36
  #pragma warning(disable: 4244 4267) // possible loss of data
37
  #endif
38
 
 
39
  static llama_context ** g_ctx;
 
40
  static bool is_interacting = false;
41
 
42
  #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 
45
  if (!is_interacting) {
46
  is_interacting=true;
47
  } else {
48
+ console::cleanup();
49
  printf("\n");
50
  llama_print_timings(*g_ctx);
51
  _exit(130);
 
63
 
64
  // save choice to use color for later
65
  // (note for later: this is a slightly awkward choice)
66
+ console::init(params.simple_io, params.use_color);
67
+ atexit([]() { console::cleanup(); });
 
 
68
 
69
  if (params.perplexity) {
70
  printf("\n************\n");
 
370
 
371
  if (params.interactive) {
372
  const char *control_message;
373
+ if (params.multiline_input) {
374
  control_message = " - To return control to LLaMa, end your input with '\\'.\n"
375
  " - To return control without starting a new line, end your input with '/'.\n";
376
  } else {
 
398
  int n_past_guidance = 0;
399
 
400
  // the first thing we will do is to output the prompt, so set color accordingly
401
+ console::set_display(console::prompt);
402
 
403
  std::vector<llama_token> embd;
404
  std::vector<llama_token> embd_guidance;
 
419
  // Ensure the input doesn't exceed the context size by truncating embd if necessary.
420
  if ((int)embd.size() > max_embd_size) {
421
  auto skipped_tokens = embd.size() - max_embd_size;
422
+ console::set_display(console::error);
423
  printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
424
+ console::set_display(console::reset);
425
  fflush(stdout);
426
  embd.resize(max_embd_size);
427
  }
 
664
  }
665
  // reset color to default if we there is no pending user input
666
  if (input_echo && (int)embd_inp.size() == n_consumed) {
667
+ console::set_display(console::reset);
668
  }
669
 
670
  // if not currently processing queued inputs;
 
690
  if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
691
  if (params.interactive) {
692
  is_interacting = true;
693
+ console::set_display(console::user_input);
694
  }
695
  is_antiprompt = true;
696
  fflush(stdout);
 
711
 
712
  is_interacting = true;
713
  printf("\n");
714
+ console::set_display(console::user_input);
715
  fflush(stdout);
716
  } else if (params.instruct) {
717
  is_interacting = true;
 
736
  std::string line;
737
  bool another_line = true;
738
  do {
739
+ another_line = console::readline(line, params.multiline_input);
740
  buffer += line;
741
  } while (another_line);
742
 
743
  // done taking input, reset color
744
+ console::set_display(console::reset);
745
 
746
  // Add tokens to embd only if the input buffer is non-empty
747
  // Entering a empty line lets the user pass control back
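Editor's summary of the console API migration performed in this file; each mapping below pairs a removed call (left) with the call that replaces it (right), exactly as the hunks above show:

```cpp
// console_init(con_st)                                -> console::init(params.simple_io, params.use_color)
// console_cleanup(con_st)                             -> console::cleanup()
// console_set_color(con_st, CONSOLE_COLOR_DEFAULT)    -> console::set_display(console::reset)
// console_set_color(con_st, CONSOLE_COLOR_PROMPT)     -> console::set_display(console::prompt)
// console_set_color(con_st, CONSOLE_COLOR_USER_INPUT) -> console::set_display(console::user_input)
// console_set_color(con_st, CONSOLE_COLOR_ERROR)      -> console::set_display(console::error)
// console_readline(con_st, line)                      -> console::readline(line, params.multiline_input)
```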
examples/perplexity/perplexity.cpp CHANGED
@@ -121,8 +121,23 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
121
  printf("\n");
122
  }
123
 
124
- void perplexity_lines(llama_context * ctx, const gpt_params & params) {
125
- // Calculates perplexity over each line of the prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  std::vector<std::string> prompt_lines;
128
  std::istringstream strstream(params.prompt);
@@ -132,63 +147,149 @@ void perplexity_lines(llama_context * ctx, const gpt_params & params) {
132
  prompt_lines.push_back(line);
133
  }
134
 
135
- const int n_vocab = llama_n_vocab(ctx);
 
 
 
136
 
137
- int counttotal = 0;
138
- size_t n_lines = prompt_lines.size();
139
 
140
- double nll = 0.0;
 
 
 
 
 
 
141
 
142
- fprintf(stderr, "%s: calculating perplexity over %lu lines\n", __func__, n_lines);
 
143
 
144
- printf("\nLine\tPPL line\tPPL cumulative\n");
 
145
 
146
- for (size_t i = 0; i < n_lines; ++i) {
 
 
 
 
 
 
 
147
 
148
- // Tokenize and insert BOS at start
149
- std::vector<int> batch_embd = ::llama_tokenize(ctx, prompt_lines[i], true);
150
 
151
- size_t batch_size = batch_embd.size();
 
 
 
152
 
153
- // Stop if line is too long
154
- if( batch_size > (size_t)params.n_ctx ) {
155
- fprintf(stderr, "%s : tokens in line %lu > n_ctxl\n", __func__, i);
156
- return;
157
  }
158
 
159
- if (llama_eval(ctx, batch_embd.data(), batch_size, 0, params.n_threads)) {
160
- fprintf(stderr, "%s : failed to eval\n", __func__);
161
- return;
 
162
  }
163
 
164
- const auto batch_logits = llama_get_logits(ctx);
165
- std::vector<float> logits;
166
- logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
 
 
167
 
168
- double nllline = 0.0;
169
- int countline = 0;
170
 
171
- // Perplexity over second half of the line
172
- for (size_t j = batch_size/2; j < batch_size - 1; ++j) {
173
- // Calculate probability of next token, given the previous ones.
174
- const std::vector<float> tok_logits(
175
- logits.begin() + (j + 0) * n_vocab,
176
- logits.begin() + (j + 1) * n_vocab);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
- const float prob = softmax(tok_logits)[batch_embd[ j + 1]];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
- nllline += -std::log(prob);
181
- ++countline;
 
 
 
 
 
 
182
  }
183
 
184
- nll += nllline;
185
- counttotal += countline;
186
 
187
- // perplexity is e^(average negative log-likelihood)
188
- printf("%lu\t%.8lf\t%.8lf\n", i + 1, std::exp(nllline/countline), std::exp(nll / counttotal) );
189
  fflush(stdout);
190
  }
191
 
 
 
192
  printf("\n");
193
  }
194
 
@@ -240,8 +341,8 @@ int main(int argc, char ** argv) {
240
  params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
241
  }
242
 
243
- if (params.perplexity_lines) {
244
- perplexity_lines(ctx, params);
245
  } else {
246
  perplexity(ctx, params);
247
  }
 
121
  printf("\n");
122
  }
123
 
124
+ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
125
+ // Calculates hellaswag score (acc_norm) from prompt
126
+ //
127
+ // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
128
+ // All used data fields are preprocessed as in https://github.com/EleutherAI/lm-evaluation-harness/blob/df3da98c5405deafd519c2ddca52bb7c3fe36bef/lm_eval/tasks/hellaswag.py#L62-L68
129
+ //
130
+ // All 10042 tasks should be extracted to keep the results standardized like other implementations.
131
+ //
132
+ // Datafile layout:
133
+ // ['??'] denotes json fields
134
+ // 6 lines per task:
135
+ // ['activity_label'] + ": " +['ctx'] - The first part of the query, the context
136
+ // ['label'] - The index of the best common-sense ending, aka the gold ending
137
+ // ['endings'][0] - Endings added to the first part of the query
138
+ // ['endings'][1]
139
+ // ['endings'][2]
140
+ // ['endings'][3]
141
 
142
  std::vector<std::string> prompt_lines;
143
  std::istringstream strstream(params.prompt);
 
147
  prompt_lines.push_back(line);
148
  }
149
 
150
+ if( prompt_lines.size() % 6 != 0) {
151
+ fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
152
+ return;
153
+ }
154
 
155
+ size_t hs_task_count = prompt_lines.size()/6;
156
+ fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
157
 
158
+ // This is needed as usual for LLaMA models
159
+ bool prepend_bos = true;
160
+
161
+ // Number of tasks to use when computing the score
162
+ if ( params.hellaswag_tasks < hs_task_count ) {
163
+ hs_task_count = params.hellaswag_tasks;
164
+ }
165
 
166
+ // The tasks should be randomized so the score stabilizes quickly.
167
+ bool randomize_tasks = true;
168
 
169
+ // The random seed should not impact the final result if the computation is done over enough tasks, so it is kept hardcoded for now
170
+ std::mt19937 rng(1);
171
 
172
+ // Dataholder for hellaswag tasks
173
+ struct hs_data_t {
174
+ std::string context;
175
+ size_t gold_ending_idx;
176
+ std::string ending[4];
177
+ size_t ending_logprob_count[4];
178
+ double ending_logprob[4];
179
+ };
180
 
181
+ fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
 
182
 
183
+ // Select and read data from prompt lines
184
+ hs_data_t *hs_data = new hs_data_t[hs_task_count];
185
+ for (size_t i=0; i < hs_task_count; i++) {
186
+ size_t idx = i;
187
 
188
+ // Select a random example of those left in the prompt
189
+ if (randomize_tasks) {
190
+ std::uniform_int_distribution<size_t> dist(0, prompt_lines.size()/6-1 ) ;
191
+ idx = dist(rng);
192
  }
193
 
194
+ hs_data[i].context = prompt_lines[idx*6];
195
+ hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
196
+ for (size_t j=0; j < 4; j++) {
197
+ hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j];
198
  }
199
 
200
+ // Delete the selected random example from the prompt
201
+ if (randomize_tasks) {
202
+ prompt_lines.erase( std::next(prompt_lines.begin(),idx*6) , std::next(prompt_lines.begin(),idx*6+6) );
203
+ }
204
+ }
205
 
206
+ fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
207
+ printf("\ntask\tacc_norm\n");
208
 
209
+ double acc = 0.0f;
210
+ const int n_vocab = llama_n_vocab(ctx);
211
+
212
+ for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
213
+
214
+ // Tokenize the context to count tokens
215
+ std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
216
+ size_t context_size = context_embd.size();
217
+
218
+ for (size_t ending_idx=0;ending_idx<4;ending_idx++) {
219
+
220
+ // Tokenize the query
221
+ std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos);
222
+ size_t query_size = query_embd.size();
223
+
224
+ // Stop if the query won't fit the ctx window
225
+ if (query_size > (size_t)params.n_ctx) {
226
+ fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
227
+ return;
228
+ }
229
 
230
+ // Speed up small evaluations by evaluating at least 32 tokens
231
+ if (query_size < 32) {
232
+ query_embd.resize(32);
233
+ }
234
+
235
+ // Evaluate the query
236
+ if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
237
+ fprintf(stderr, "%s : failed to eval\n", __func__);
238
+ return;
239
+ }
240
+
241
+ const auto query_logits = llama_get_logits(ctx);
242
+ std::vector<float> logits;
243
+ logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
244
+
245
+ hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
246
+ hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
247
+
248
+ // Calculate the logprobs over the ending
249
+ for (size_t j = context_size-1; j < query_size - 1; j++) {
250
+ // Calculate probability of next token, given the previous ones.
251
+ const std::vector<float> tok_logits(
252
+ logits.begin() + (j + 0) * n_vocab,
253
+ logits.begin() + (j + 1) * n_vocab);
254
+
255
+ const float prob = softmax(tok_logits)[query_embd[ j + 1]];
256
+
257
+ hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
258
+ hs_data[task_idx].ending_logprob_count[ending_idx]++;
259
+ }
260
+
261
+ // Calculate the mean token logprob for acc_norm
262
+ hs_data[task_idx].ending_logprob[ending_idx] /= hs_data[task_idx].ending_logprob_count[ending_idx];
263
+
264
+
265
+ // printf("task %lu, ending %lu, whole_len %lu, context_len %lu, ending_logprob_count %lu, ending_logprob %.4f\n",
266
+ // task_idx,ending_idx,whole_size,context_size, hs_data[task_idx].ending_logprob_count[ending_idx], hs_data[task_idx].ending_logprob[ending_idx] );
267
+ }
268
 
269
+ // Find the ending with maximum logprob
270
+ size_t ending_logprob_max_idx = -1;
271
+ double ending_logprob_max_val = -INFINITY;
272
+ for (size_t j=0; j < 4; j++) {
273
+ if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
274
+ ending_logprob_max_idx = j;
275
+ ending_logprob_max_val = hs_data[task_idx].ending_logprob[j];
276
+ }
277
  }
278
 
279
+ // printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_data[task_idx].gold_ending_idx);
 
280
 
281
+ // If the gold ending got the maximum logprob, add one accuracy point
282
+ if (ending_logprob_max_idx == hs_data[task_idx].gold_ending_idx) {
283
+ acc += 1.0;
284
+ }
285
+
286
+ // Print the accumulated accuracy mean x 100
287
+ printf("%zu\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0);
288
  fflush(stdout);
289
  }
290
 
291
+ delete [] hs_data;
292
+
293
  printf("\n");
294
  }
295
 
 
341
  params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
342
  }
343
 
344
+ if (params.hellaswag) {
345
+ hellaswag_score(ctx, params);
346
  } else {
347
  perplexity(ctx, params);
348
  }
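The new hellaswag_score above reduces to a simple selection rule: score each of the four endings by the mean per-token log-probability of its tokens, pick the ending with the highest mean, and count the task as correct when that pick equals the gold ending index from the datafile. The standalone C++ sketch below is not part of this commit; the example task lines in the comment and the log-prob sums and token counts are invented stand-ins for what the real code reads from the prompt file and accumulates from llama_eval logits.

// Sketch only: the acc_norm selection rule applied per task by hellaswag_score.
// Datafile layout reminder (6 lines per task), with an invented example:
//   Riding a bike: A man pedals down a quiet street. He
//   2
//   falls asleep on the handlebars.
//   waves at every passing car.
//   turns the corner and keeps pedaling.
//   throws the bicycle into a river.
#include <cstddef>
#include <cstdio>
#include <cmath>

int main() {
    const size_t gold_ending_idx = 2;  // the ['label'] line of the task

    // Hypothetical summed log-probs and token counts for the 4 endings,
    // standing in for ending_logprob / ending_logprob_count in the real code.
    const double ending_logprob_sum[4]   = { -14.2, -11.9, -7.3, -16.0 };
    const size_t ending_logprob_count[4] = { 9, 7, 6, 10 };

    size_t best_idx = 0;
    double best_val = -INFINITY;
    for (size_t j = 0; j < 4; ++j) {
        // Mean token log-prob: the normalization that gives acc_norm its name.
        const double mean_logprob = ending_logprob_sum[j] / (double) ending_logprob_count[j];
        if (mean_logprob > best_val) {
            best_val = mean_logprob;
            best_idx = j;
        }
    }

    const double point = (best_idx == gold_ending_idx) ? 1.0 : 0.0;
    printf("picked ending %zu, gold ending %zu, acc_norm point %.1f\n",
           best_idx, gold_ending_idx, point);
    return 0;
}

Over many tasks these points are averaged and multiplied by 100, which is exactly the running acc_norm column the new code prints.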
examples/save-load-state/save-load-state.cpp CHANGED
@@ -26,6 +26,7 @@ int main(int argc, char ** argv) {
26
  auto lparams = llama_context_default_params();
27
 
28
  lparams.n_ctx = params.n_ctx;
 
29
  lparams.seed = params.seed;
30
  lparams.f16_kv = params.memory_f16;
31
  lparams.use_mmap = params.use_mmap;
 
26
  auto lparams = llama_context_default_params();
27
 
28
  lparams.n_ctx = params.n_ctx;
29
+ lparams.n_gqa = params.n_gqa;
30
  lparams.seed = params.seed;
31
  lparams.f16_kv = params.memory_f16;
32
  lparams.use_mmap = params.use_mmap;
examples/server-llama2-13B.sh ADDED
@@ -0,0 +1,26 @@
1
+ #!/bin/bash
2
+
3
+ set -e
4
+
5
+ cd "$(dirname "$0")/.." || exit
6
+
7
+ # Specify the model you want to use here:
8
+ MODEL="${MODEL:-./models/llama-2-13b-chat.ggmlv3.q5_K_M.bin}"
9
+ PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt}
10
+
11
+ # Adjust to the number of CPU cores you want to use.
12
+ N_THREAD="${N_THREAD:-12}"
13
+
14
+ # Note: you can also override the generation options by specifying them on the command line:
15
+ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
16
+
17
+
18
+ # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
19
+ ./server $GEN_OPTIONS \
20
+ --model "$MODEL" \
21
+ --threads "$N_THREAD" \
22
+ --rope-freq-scale 1.0 \
23
+ "$@"
24
+
25
+ # I used this to test the model with mps, but omitted it from the general-purpose defaults. If you want to use it, just specify it on the command line.
26
+ # -ngl 1 \
examples/server/README.md CHANGED
@@ -151,6 +151,8 @@ node .
151
 
152
  `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
153
 
 
 
154
  `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
155
 
156
  `ignore_eos`: Ignore end of stream token and continue generating (default: false).
@@ -163,7 +165,7 @@ node .
163
 
164
  `content`: Set the text to tokenize.
165
 
166
- Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
167
 
168
  - **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
169
 
 
151
 
152
  `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
153
 
154
+ `grammar`: Set grammar for grammar-based sampling (default: no grammar).
155
+
156
  `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
157
 
158
  `ignore_eos`: Ignore end of stream token and continue generating (default: false).
 
165
 
166
  `content`: Set the text to tokenize.
167
 
168
+ Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
169
 
170
  - **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
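As a rough illustration of the `grammar` field documented above (this example is not from the README): a `/completion` request body can carry a grammar string alongside the usual options. The `prompt` and `n_predict` fields are the same ones used by the other server examples in this commit; the grammar text assumes llama.cpp's GBNF notation, and the specific rule constraining the answer to "yes" or "no" is made up.

// Illustration only: building a /completion request body that sets the new
// `grammar` field. The JSON is assembled as a plain string; pass it to the
// server e.g. with curl --data-raw. The GBNF rule here is a hypothetical
// example constraining output to the words "yes" or "no".
#include <cstdio>
#include <string>

int main() {
    const std::string body =
        "{\n"
        "  \"prompt\": \"Is the sky blue? Answer with one word: \",\n"
        "  \"n_predict\": 4,\n"
        "  \"grammar\": \"root ::= (\\\"yes\\\" | \\\"no\\\")\"\n"
        "}";
    printf("%s\n", body.c_str());
    return 0;
}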
171
 
examples/server/chat-llama2.sh ADDED
@@ -0,0 +1,109 @@
1
+ #!/bin/bash
2
+
3
+ API_URL="${API_URL:-http://127.0.0.1:8080}"
4
+
5
+ CHAT=(
6
+ "Hello, Assistant."
7
+ "Hello. How may I help you today?"
8
+ )
9
+
10
+ INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
11
+
12
+ trim() {
13
+ shopt -s extglob
14
+ set -- "${1##+([[:space:]])}"
15
+ printf "%s" "${1%%+([[:space:]])}"
16
+ }
17
+
18
+ trim_trailing() {
19
+ shopt -s extglob
20
+ printf "%s" "${1%%+([[:space:]])}"
21
+ }
22
+
23
+ format_prompt() {
24
+ if [[ "${#CHAT[@]}" -eq 0 ]]; then
25
+ echo -n "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
26
+ else
27
+ LAST_INDEX=$(( ${#CHAT[@]} - 1 ))
28
+ echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]"
29
+ fi
30
+ }
31
+
32
+ tokenize() {
33
+ curl \
34
+ --silent \
35
+ --request POST \
36
+ --url "${API_URL}/tokenize" \
37
+ --header "Content-Type: application/json" \
38
+ --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
39
+ | jq '.tokens[]'
40
+ }
41
+
42
+ N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)
43
+
44
+ chat_completion() {
45
+ PROMPT="$(trim_trailing "$(format_prompt "$1")")"
46
+ DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
47
+ prompt: .,
48
+ temperature: 0.2,
49
+ top_k: 40,
50
+ top_p: 0.9,
51
+ n_keep: $n_keep,
52
+ n_predict: 1024,
53
+ stop: ["[INST]"],
54
+ stream: true
55
+ }')"
56
+
57
+ # Create a temporary file to hold the Python output
58
+ TEMPFILE=$(mktemp)
59
+
60
+ exec 3< <(curl \
61
+ --silent \
62
+ --no-buffer \
63
+ --request POST \
64
+ --url "${API_URL}/completion" \
65
+ --header "Content-Type: application/json" \
66
+ --data-raw "${DATA}")
67
+
68
+ python -c "
69
+ import json
70
+ import sys
71
+
72
+ answer = ''
73
+ while True:
74
+ line = sys.stdin.readline()
75
+ if not line:
76
+ break
77
+ if line.startswith('data: '):
78
+ json_content = line[6:].strip()
79
+ content = json.loads(json_content)['content']
80
+ sys.stdout.write(content)
81
+ sys.stdout.flush()
82
+ answer += content
83
+
84
+ answer = answer.rstrip('\n')
85
+
86
+ # Write the answer to the temporary file
87
+ with open('$TEMPFILE', 'w') as f:
88
+ f.write(answer)
89
+ " <&3
90
+
91
+ exec 3<&-
92
+
93
+ # Read the answer from the temporary file
94
+ ANSWER=$(cat $TEMPFILE)
95
+
96
+ # Clean up the temporary file
97
+ rm $TEMPFILE
98
+
99
+ printf "\n"
100
+
101
+ CHAT+=("$1" "$(trim "$ANSWER")")
102
+ }
103
+
104
+ while true; do
105
+ echo -en "\033[0;32m" # Green color
106
+ read -r -e -p "> " QUESTION
107
+ echo -en "\033[0m" # Reset color
108
+ chat_completion "${QUESTION}"
109
+ done
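The chat-llama2.sh script above keeps the running conversation in the CHAT array and rebuilds the prompt with format_prompt before every request. The small C++ sketch below (not part of the commit; the sample strings are illustrative) mirrors that assembly: the very first request carries only the [INST] <<SYS>> ... <</SYS>> system block, and every later request carries the last stored reply followed by [INST] question [/INST].

// Sketch of the Llama-2 chat prompt layout that chat-llama2.sh's format_prompt
// produces; string contents are illustrative.
#include <cstdio>
#include <string>
#include <vector>

static std::string format_prompt(const std::vector<std::string> & chat,
                                 const std::string & instruction,
                                 const std::string & question) {
    if (chat.empty()) {
        // First request: only the system block is sent.
        return "[INST] <<SYS>>\n" + instruction + "\n<</SYS>>";
    }
    // Later requests: the script reuses only the last stored assistant reply.
    return chat.back() + "\n[INST] " + question + " [/INST]";
}

int main() {
    const std::string instruction =
        "A chat between a curious human and an artificial intelligence assistant.";
    std::vector<std::string> chat = {
        "Hello, Assistant.",
        "Hello. How may I help you today?",
    };
    printf("%s\n", format_prompt(chat, instruction, "What is llama.cpp?").c_str());
    return 0;
}

Each completed exchange is then appended back to the history, as the script does with CHAT+=("$1" "$(trim "$ANSWER")").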
examples/server/completion.js.hpp CHANGED
@@ -87,289 +87,342 @@ unsigned char completion_js[] = {
87
  0x20, 0x54, 0x65, 0x78, 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72,
88
  0x28, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
89
  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b,
90
- 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20,
91
- 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d,
92
- 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
93
- 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29,
94
- 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
95
- 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20,
96
- 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72,
97
- 0x2e, 0x72, 0x65, 0x61, 0x64, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
98
- 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c,
99
- 0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
100
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b,
101
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
102
- 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x73, 0x65, 0x20, 0x61,
103
- 0x6e, 0x73, 0x77, 0x65, 0x72, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68,
104
- 0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69,
105
- 0x70, 0x6c, 0x65, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x6f, 0x66,
106
- 0x3a, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x5c, 0x6e, 0x20, 0x77, 0x69,
107
- 0x74, 0x68, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x61, 0x6c, 0x77, 0x61,
108
- 0x79, 0x73, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x74, 0x20, 0x61,
109
- 0x73, 0x20, 0x61, 0x20, 0x6b, 0x65, 0x79, 0x2e, 0x20, 0x69, 0x6e, 0x20,
110
- 0x6f, 0x75, 0x72, 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x77, 0x65, 0x0a,
111
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x6d, 0x61, 0x69,
112
- 0x6e, 0x6c, 0x79, 0x20, 0x63, 0x61, 0x72, 0x65, 0x20, 0x61, 0x62, 0x6f,
113
- 0x75, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3a,
114
- 0x20, 0x6b, 0x65, 0x79, 0x20, 0x68, 0x65, 0x72, 0x65, 0x2c, 0x20, 0x77,
115
- 0x68, 0x69, 0x63, 0x68, 0x20, 0x77, 0x65, 0x20, 0x65, 0x78, 0x70, 0x65,
116
- 0x63, 0x74, 0x20, 0x61, 0x73, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x0a, 0x20,
117
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74,
118
- 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65,
119
- 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x28, 0x72, 0x65, 0x73,
120
- 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a,
121
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x61,
122
- 0x72, 0x73, 0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20,
123
- 0x65, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61,
124
- 0x64, 0x64, 0x20, 0x74, 0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72,
125
- 0x65, 0x73, 0x75, 0x6c, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
126
- 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20,
127
- 0x3d, 0x20, 0x2f, 0x5e, 0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73,
128
- 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20,
129
- 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e,
130
- 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x6f, 0x66, 0x20,
131
- 0x74, 0x65, 0x78, 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x41, 0x6c,
132
- 0x6c, 0x28, 0x72, 0x65, 0x67, 0x65, 0x78, 0x29, 0x29, 0x20, 0x7b, 0x0a,
133
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75,
134
- 0x6c, 0x74, 0x5b, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d,
135
- 0x20, 0x3d, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a,
136
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
137
- 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x20,
138
- 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73,
139
- 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70,
140
- 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a, 0x75, 0x73,
141
- 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74, 0x68, 0x65,
142
- 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64, 0x61, 0x74,
143
- 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75,
144
- 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53,
145
- 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73,
146
- 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20,
147
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
148
- 0x20, 0x2b, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
149
- 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b,
150
- 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x79,
151
- 0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79,
152
- 0x69, 0x65, 0x6c, 0x64, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b,
153
- 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x69,
154
- 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20, 0x73,
155
- 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66, 0x72,
156
- 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20, 0x77,
157
- 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b,
158
- 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
159
- 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
160
- 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a,
161
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
162
- 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e,
163
- 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
164
- 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20,
165
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e,
166
- 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
167
- 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
168
- 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72,
169
- 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
170
- 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
171
- 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72,
172
- 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
173
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63,
174
- 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20,
175
- 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d,
176
- 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74,
177
- 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
178
- 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e,
179
- 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
180
- 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65,
181
- 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
182
- 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20,
183
- 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20,
184
- 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f,
185
- 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29,
186
- 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74,
187
- 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b,
188
- 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20,
189
- 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
190
- 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74,
191
- 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79,
192
- 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x63, 0x72,
193
- 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f,
194
- 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f,
195
- 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72,
196
- 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65,
197
- 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66,
198
- 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
199
- 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a,
200
- 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
201
- 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
202
- 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28,
203
- 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
204
- 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76,
205
- 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28,
206
- 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28,
207
- 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
208
- 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
209
- 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63,
210
- 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e,
211
- 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20,
212
- 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70,
213
- 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c,
214
- 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
215
- 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
216
- 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b,
217
- 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20,
218
- 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63,
219
- 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
220
- 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
221
- 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29,
222
- 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28,
223
- 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c,
224
- 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
225
- 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72,
226
- 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73,
227
- 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c,
228
- 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
229
- 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e,
230
- 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
231
- 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
232
- 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
233
- 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
234
- 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74,
235
- 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20,
236
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74,
237
- 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61,
238
- 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77,
239
- 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74,
240
- 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20,
241
- 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68,
242
- 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29,
243
- 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
244
- 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e,
245
- 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72,
246
- 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
247
- 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
248
- 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
249
- 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
250
- 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
251
- 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e,
252
- 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
253
- 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74,
254
- 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
255
  0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
256
- 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20,
257
- 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
258
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63,
259
- 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69,
260
- 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
261
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
262
- 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63,
263
- 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43,
264
- 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22,
265
- 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20,
266
- 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e,
267
- 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e,
268
- 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
269
- 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
270
- 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
271
- 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
272
- 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
273
- 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e,
274
- 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
275
- 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
276
- 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28,
277
- 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
278
- 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b,
279
- 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20,
280
- 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
281
- 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20,
282
- 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65,
283
- 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d,
284
- 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e,
285
- 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e,
286
- 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73,
287
- 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a,
288
- 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a,
289
- 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c,
290
- 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70,
291
- 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28,
292
- 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e,
293
- 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
294
- 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
295
- 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a,
296
- 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f,
297
- 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f,
298
- 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
299
- 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
300
- 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
301
- 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d,
302
- 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64,
303
- 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74,
304
- 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f,
305
- 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e,
306
- 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d,
307
- 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
 
308
  0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20,
309
  0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d,
310
  0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
311
- 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50,
312
- 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63,
313
- 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72,
314
- 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
315
- 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74,
316
- 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20,
317
- 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
318
- 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
319
- 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
320
- 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72,
321
- 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
322
- 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b,
323
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
324
- 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e,
325
- 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65,
326
- 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
327
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
328
- 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a,
329
- 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20,
330
- 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
331
- 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65,
332
- 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
333
- 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f,
334
- 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65,
335
- 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65,
336
- 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
337
- 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74,
338
- 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70,
339
- 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
340
- 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
341
- 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
342
- 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63,
343
- 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f,
344
- 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61,
345
- 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70,
346
- 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e,
347
- 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20,
348
- 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61,
349
- 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20,
350
- 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74,
351
- 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69,
352
- 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65,
353
- 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69,
354
- 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20,
355
- 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20,
356
- 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20,
357
- 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73,
358
- 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74,
359
- 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
360
- 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20,
361
- 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20,
362
- 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e,
363
- 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
364
- 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
 
 
365
  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
366
- 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x77,
367
- 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f,
368
- 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22, 0x29,
369
- 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72,
370
- 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20,
371
- 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67,
372
- 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
373
- 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
374
  };
375
- unsigned int completion_js_len = 4462;
 
87
  0x20, 0x54, 0x65, 0x78, 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72,
88
  0x28, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
89
  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b,
90
+ 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f,
91
+ 0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f,
92
+ 0x20, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20,
93
+ 0x70, 0x61, 0x72, 0x74, 0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65,
94
+ 0x61, 0x64, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20,
95
+ 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65,
96
+ 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75,
97
+ 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c,
98
+ 0x65, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20,
99
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72,
100
+ 0x65, 0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69,
101
+ 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61,
102
+ 0x64, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
103
+ 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f,
104
+ 0x6e, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
105
+ 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20,
106
+ 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
107
+ 0x2f, 0x2f, 0x20, 0x41, 0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c,
108
+ 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61,
109
+ 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72,
110
+ 0x65, 0x6e, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66,
111
+ 0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
112
+ 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d,
113
+ 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20,
114
+ 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f,
115
+ 0x64, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61,
116
+ 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
117
+ 0x20, 0x2f, 0x2f, 0x20, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66,
118
+ 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68,
119
+ 0x61, 0x72, 0x61, 0x63, 0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61,
120
+ 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a,
121
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
122
+ 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65,
123
+ 0x42, 0x72, 0x65, 0x61, 0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74,
124
+ 0x2e, 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c,
125
+ 0x6e, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
126
+ 0x2f, 0x2f, 0x20, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65,
127
+ 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c,
128
+ 0x69, 0x6e, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c,
129
+ 0x65, 0x74, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74,
130
+ 0x65, 0x78, 0x74, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c,
131
+ 0x6e, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
132
+ 0x2f, 0x2f, 0x20, 0x49, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65,
133
+ 0x78, 0x74, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65,
134
+ 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69,
135
+ 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68,
136
+ 0x65, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20,
137
+ 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f,
138
+ 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
139
+ 0x20, 0x2f, 0x2f, 0x20, 0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74,
140
+ 0x20, 0x69, 0x6e, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72,
141
+ 0x20, 0x74, 0x6f, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64,
142
+ 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74,
143
+ 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61,
144
+ 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
145
+ 0x28, 0x21, 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69,
146
+ 0x6e, 0x65, 0x42, 0x72, 0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20,
147
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f,
148
+ 0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e,
149
+ 0x70, 0x6f, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
150
+ 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20,
151
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76,
152
+ 0x65, 0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20,
153
+ 0x52, 0x65, 0x73, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76,
154
+ 0x65, 0x72, 0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76,
155
+ 0x65, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65,
156
+ 0x61, 0x6b, 0x20, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e,
157
+ 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
158
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73,
159
+ 0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76,
160
+ 0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64,
161
+ 0x20, 0x74, 0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73,
162
+ 0x75, 0x6c, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
163
+ 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20,
164
+ 0x2f, 0x5e, 0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e,
165
+ 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
166
+ 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74,
167
+ 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e,
168
+ 0x65, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
169
+ 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63,
170
+ 0x68, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78,
171
+ 0x65, 0x63, 0x28, 0x6c, 0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20,
172
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61,
173
+ 0x74, 0x63, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
174
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b,
175
+ 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20,
176
+ 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20,
177
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69,
178
+ 0x6e, 0x63, 0x65, 0x20, 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20,
179
+ 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
180
+ 0x61, 0x2e, 0x63, 0x70, 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73,
181
+ 0x20, 0x6a, 0x75, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65,
182
+ 0x20, 0x74, 0x68, 0x65, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e,
183
+ 0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
184
+ 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75,
185
+ 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20,
186
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
187
+ 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d,
188
+ 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28,
189
+ 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29,
190
+ 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
191
+ 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d,
192
+ 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
193
+ 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20,
194
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
195
+ 0x2f, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20,
196
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c,
197
+ 0x64, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20,
198
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
199
+ 0x2f, 0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20,
200
+ 0x61, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e,
201
+ 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72,
202
+ 0x2c, 0x20, 0x77, 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72,
203
+ 0x65, 0x61, 0x6b, 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20,
204
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
205
+ 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
206
+ 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
207
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
208
+ 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61,
209
+ 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f,
210
+ 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20,
211
+ 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
212
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
213
+ 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
214
+ 0x73, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
 
 
215
  0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
216
+ 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b,
217
+ 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
218
+ 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
219
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20,
220
+ 0x3d, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20,
221
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62,
222
+ 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
223
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
224
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
225
+ 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
226
+ 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20,
227
+ 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a,
228
+ 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61,
229
+ 0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72,
230
+ 0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20,
231
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65,
232
+ 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d,
233
+ 0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20,
234
+ 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
235
+ 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20,
236
+ 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79,
237
+ 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
238
+ 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28,
239
+ 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65,
240
+ 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
241
+ 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c,
242
+ 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75,
243
+ 0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20,
244
+ 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20,
245
+ 0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x63,
246
+ 0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f,
247
+ 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f,
248
+ 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f,
249
+ 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76,
250
+ 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20,
251
+ 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c,
252
+ 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f,
253
+ 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
254
+ 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
255
+ 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
256
+ 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20,
257
+ 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45,
258
+ 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72,
259
+ 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20,
260
+ 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
261
+ 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63,
262
+ 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28,
263
+ 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
264
+ 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f,
265
+ 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78,
266
+ 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
267
+ 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
268
+ 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
269
  0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20,
270
  0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d,
271
  0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
272
+ 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
273
+ 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20,
274
+ 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28,
275
+ 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20,
276
+ 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
277
+ 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
278
+ 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f,
279
+ 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e,
280
+ 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20,
281
+ 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
282
+ 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f,
283
+ 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
284
+ 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b,
285
+ 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
286
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
287
+ 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61,
288
+ 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a,
289
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e,
290
+ 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70,
291
+ 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65,
292
+ 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e,
293
+ 0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c,
294
+ 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63,
295
+ 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29,
296
+ 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
297
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75,
298
+ 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65,
299
+ 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
300
+ 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
301
+ 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
302
+ 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45,
303
+ 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73,
304
+ 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65,
305
+ 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74,
306
+ 0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65,
307
+ 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
308
+ 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74,
309
+ 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73,
310
+ 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
311
+ 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
312
+ 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74,
313
+ 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
314
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
315
+ 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74,
316
+ 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20,
317
+ 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
318
+ 0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b,
319
+ 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75,
320
+ 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69,
321
+ 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
322
+ 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
323
+ 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
324
+ 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45,
325
+ 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73,
326
+ 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f,
327
+ 0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69,
328
+ 0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
329
+ 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29,
330
+ 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
331
+ 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
332
+ 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c,
333
+ 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75,
334
+ 0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65,
335
+ 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
336
+ 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f,
337
+ 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74,
338
+ 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20,
339
+ 0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20,
340
+ 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f,
341
+ 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a,
342
+ 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c,
343
+ 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28,
344
+ 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e,
345
+ 0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d,
346
+ 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
347
+ 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72,
348
+ 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
349
+ 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f,
350
+ 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a,
351
+ 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
352
+ 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
353
+ 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
354
+ 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f,
355
+ 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20,
356
+ 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
357
+ 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a,
358
+ 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f,
359
+ 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f,
360
+ 0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d,
361
+ 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d,
362
+ 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20,
363
+ 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
364
+ 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20,
365
+ 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e,
366
+ 0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20,
367
+ 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
368
+ 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e,
369
+ 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20,
370
+ 0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
371
+ 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74,
372
+ 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e,
373
+ 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70,
374
+ 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
375
+ 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20,
376
+ 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
377
+ 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75,
378
+ 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
379
+ 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
380
+ 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c,
381
+ 0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b,
382
+ 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68,
383
+ 0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20,
384
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28,
385
+ 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
386
+ 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a,
387
+ 0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72,
388
+ 0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a,
389
+ 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
390
+ 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
391
+ 0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28,
392
+ 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74,
393
+ 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c,
394
+ 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
395
+ 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28,
396
+ 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20,
397
+ 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72,
398
+ 0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20,
399
+ 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f,
400
+ 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29,
401
+ 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
402
+ 0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a,
403
+ 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65,
404
+ 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20,
405
+ 0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68,
406
+ 0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68,
407
+ 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c,
408
+ 0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
409
+ 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74,
410
+ 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20,
411
+ 0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72,
412
+ 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
413
+ 0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d,
414
+ 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e,
415
+ 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65,
416
+ 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74,
417
+ 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
418
+ 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
419
+ 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61,
420
+ 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22,
421
+ 0x2f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22,
422
+ 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20,
423
+ 0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20,
424
+ 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
425
  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
426
+ 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
 
 
 
 
 
 
 
427
  };
428
+ unsigned int completion_js_len = 5099;
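The block above is the tail of the regenerated completion.js.hpp header: the public completion.js asset embedded as a byte array plus a length constant, so the server binary can serve the page without reading it from disk at runtime. A hypothetical regeneration helper is sketched below; the real project most likely uses an xxd -i style step instead, so treat the program name and layout details as assumptions.

// embed_asset.cpp - illustrative helper that turns a file into a C header with
// the same shape as completion.js.hpp (unsigned char array + *_len constant).
#include <cstdio>
#include <fstream>
#include <iterator>
#include <vector>

int main(int argc, char ** argv) {
    if (argc < 3) { std::fprintf(stderr, "usage: %s <file> <symbol>\n", argv[0]); return 1; }
    std::ifstream in(argv[1], std::ios::binary);
    std::vector<unsigned char> bytes((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
    std::printf("unsigned char %s[] = {", argv[2]);
    for (size_t i = 0; i < bytes.size(); ++i) {
        if (i % 12 == 0) std::printf("\n  ");
        std::printf("0x%02x, ", bytes[i]);
    }
    std::printf("\n};\nunsigned int %s_len = %zu;\n", argv[2], bytes.size());
    return 0;
}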
examples/server/index.html.hpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/server/public/completion.js CHANGED
@@ -43,6 +43,7 @@ export async function* llama(prompt, params = {}, config = {}) {
43
  const decoder = new TextDecoder();
44
 
45
  let content = "";
 
46
 
47
  try {
48
  let cont = true;
@@ -53,29 +54,47 @@ export async function* llama(prompt, params = {}, config = {}) {
53
  break;
54
  }
55
 
56
- // sse answers in the form multiple lines of: value\n with data always present as a key. in our case we
57
- // mainly care about the data: key here, which we expect as json
58
- const text = decoder.decode(result.value);
59
 
60
- // parse all sse events and add them to result
61
- const regex = /^(\S+):\s(.*)$/gm;
62
- for (const match of text.matchAll(regex)) {
63
- result[match[1]] = match[2]
64
- }
65
 
66
- // since we know this is llama.cpp, let's just decode the json in data
67
- result.data = JSON.parse(result.data);
68
- content += result.data.content;
69
 
70
- // yield
71
- yield result;
 
 
 
 
 
72
 
73
- // if we got a stop token from server, we will break here
74
- if (result.data.stop) {
75
- if (result.data.generation_settings) {
76
- generation_settings = result.data.generation_settings;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  }
78
- break;
79
  }
80
  }
81
  } catch (e) {
 
43
  const decoder = new TextDecoder();
44
 
45
  let content = "";
46
+ let leftover = ""; // Buffer for partially read lines
47
 
48
  try {
49
  let cont = true;
 
54
  break;
55
  }
56
 
57
+ // Add any leftover data to the current chunk of data
58
+ const text = leftover + decoder.decode(result.value);
 
59
 
60
+ // Check if the last character is a line break
61
+ const endsWithLineBreak = text.endsWith('\n');
 
 
 
62
 
63
+ // Split the text into lines
64
+ let lines = text.split('\n');
 
65
 
66
+ // If the text doesn't end with a line break, then the last line is incomplete
67
+ // Store it in leftover to be added to the next chunk of data
68
+ if (!endsWithLineBreak) {
69
+ leftover = lines.pop();
70
+ } else {
71
+ leftover = ""; // Reset leftover if we have a line break at the end
72
+ }
73
 
74
+ // Parse all sse events and add them to result
75
+ const regex = /^(\S+):\s(.*)$/gm;
76
+ for (const line of lines) {
77
+ const match = regex.exec(line);
78
+ if (match) {
79
+ result[match[1]] = match[2]
80
+ // since we know this is llama.cpp, let's just decode the json in data
81
+ if (result.data) {
82
+ result.data = JSON.parse(result.data);
83
+ content += result.data.content;
84
+
85
+ // yield
86
+ yield result;
87
+
88
+ // if we got a stop token from server, we will break here
89
+ if (result.data.stop) {
90
+ if (result.data.generation_settings) {
91
+ generation_settings = result.data.generation_settings;
92
+ }
93
+ cont = false;
94
+ break;
95
+ }
96
+ }
97
  }
 
98
  }
99
  }
100
  } catch (e) {
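The rewritten read loop fixes a real streaming bug: an SSE event can arrive split across two network reads, so each decoded chunk is now prepended with the previous leftover, only complete lines are parsed, and an unterminated tail is kept for the next iteration. A minimal sketch of the same buffering idea, written in C++ with illustrative names rather than the JavaScript above:

// sse_lines.cpp - keep the unterminated tail of a stream in `leftover`, return
// only the complete lines found in leftover + chunk.
#include <string>
#include <vector>

std::vector<std::string> take_complete_lines(std::string & leftover, const std::string & chunk) {
    leftover += chunk;
    std::vector<std::string> lines;
    size_t pos = 0, nl;
    while ((nl = leftover.find('\n', pos)) != std::string::npos) {
        lines.push_back(leftover.substr(pos, nl - pos));
        pos = nl + 1;
    }
    leftover.erase(0, pos); // whatever remains is an incomplete line
    return lines;
}

Each returned line can then be matched against the key/value regex exactly as the JavaScript does, with the guarantee that no JSON payload is cut in half.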
examples/server/public/index.html CHANGED
@@ -3,12 +3,11 @@
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
 
6
  <title>llama.cpp - chat</title>
7
 
8
  <style>
9
  body {
10
- background-color: #fff;
11
- color: #000;
12
  font-family: system-ui;
13
  font-size: 90%;
14
  }
@@ -283,8 +282,9 @@
283
 
284
  useEffect(() => {
285
  // scroll to bottom (if needed)
286
- if (container.current && container.current.scrollHeight <= container.current.scrollTop + container.current.offsetHeight + 300) {
287
- container.current.scrollTo(0, container.current.scrollHeight)
 
288
  }
289
  }, [messages])
290
 
 
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
6
+ <meta name="color-scheme" content="light dark">
7
  <title>llama.cpp - chat</title>
8
 
9
  <style>
10
  body {
 
 
11
  font-family: system-ui;
12
  font-size: 90%;
13
  }
 
282
 
283
  useEffect(() => {
284
  // scroll to bottom (if needed)
285
+ const parent = container.current.parentElement;
286
+ if (parent && parent.scrollHeight <= parent.scrollTop + parent.offsetHeight + 300) {
287
+ parent.scrollTo(0, parent.scrollHeight)
288
  }
289
  }, [messages])
290
 
examples/server/server.cpp CHANGED
@@ -1,6 +1,7 @@
1
  #include "common.h"
2
  #include "llama.h"
3
  #include "build-info.h"
 
4
 
5
  #ifndef NDEBUG
6
  // crash the server in debug mode, otherwise send an http 500 error
@@ -195,6 +196,8 @@ struct llama_server_context
195
  llama_context *ctx = nullptr;
196
  gpt_params params;
197
 
 
 
198
  bool truncated = false;
199
  bool stopped_eos = false;
200
  bool stopped_word = false;
@@ -226,6 +229,7 @@ struct llama_server_context
226
  void rewind()
227
  {
228
  params.antiprompt.clear();
 
229
  num_prompt_tokens = 0;
230
  num_tokens_predicted = 0;
231
  generated_text = "";
@@ -237,6 +241,7 @@ struct llama_server_context
237
  stopped_limit = false;
238
  stopping_word = "";
239
  multibyte_pending = 0;
 
240
 
241
  n_remain = 0;
242
  n_past = 0;
@@ -257,6 +262,33 @@ struct llama_server_context
257
  return true;
258
  }
259
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  void loadPrompt()
261
  {
262
  params.prompt.insert(0, 1, ' '); // always add a first space
@@ -420,6 +452,10 @@ struct llama_server_context
420
  logits[llama_token_nl()] = nl_logit;
421
  }
422
 
 
 
 
 
423
  if (temp <= 0)
424
  {
425
  // Greedy sampling
@@ -457,10 +493,15 @@ struct llama_server_context
457
  }
458
  }
459
 
 
 
 
 
460
  for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
461
  {
462
  result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
463
  }
 
464
  last_n_tokens.erase(last_n_tokens.begin());
465
  last_n_tokens.push_back(result.tok);
466
  num_tokens_predicted++;
@@ -631,6 +672,9 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
631
  fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
632
  fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
633
  fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
 
 
 
634
  #endif
635
  fprintf(stdout, " -m FNAME, --model FNAME\n");
636
  fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
@@ -827,7 +871,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
827
  }
828
  }
829
  #else
830
- LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
831
  #endif // GGML_USE_CUBLAS
832
  }
833
  else if (arg == "--low-vram" || arg == "-lv")
@@ -835,7 +879,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
835
  #ifdef GGML_USE_CUBLAS
836
  params.low_vram = true;
837
  #else
838
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 
 
 
 
 
 
 
 
839
  #endif // GGML_USE_CUBLAS
840
  }
841
  else if (arg == "--main-gpu" || arg == "-mg")
@@ -936,6 +988,7 @@ static json format_generation_settings(llama_server_context &llama)
936
  {"stream", llama.stream},
937
  {"logit_bias", llama.params.logit_bias},
938
  {"n_probs", llama.params.n_probs},
 
939
  };
940
  }
941
 
@@ -1037,6 +1090,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
1037
  llama.params.n_keep = body.value("n_keep", default_params.n_keep);
1038
  llama.params.seed = body.value("seed", default_params.seed);
1039
  llama.params.prompt = body.value("prompt", default_params.prompt);
 
1040
  llama.params.n_probs = body.value("n_probs", default_params.n_probs);
1041
 
1042
  llama.params.logit_bias.clear();
@@ -1168,6 +1222,12 @@ int main(int argc, char **argv)
1168
 
1169
  parse_options_completion(json::parse(req.body), llama);
1170
 
 
 
 
 
 
 
1171
  llama.loadPrompt();
1172
  llama.beginCompletion();
1173
 
@@ -1263,7 +1323,11 @@ int main(int argc, char **argv)
1263
  sink.done();
1264
  return true;
1265
  };
1266
- res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
 
 
 
 
1267
  } });
1268
 
1269
  svr.Get("/model.json", [&llama](const Request &, Response &res)
@@ -1319,8 +1383,12 @@ int main(int argc, char **argv)
1319
 
1320
  svr.set_error_handler([](const Request &, Response &res)
1321
  {
1322
- res.set_content("File Not Found", "text/plain");
1323
- res.status = 404; });
 
 
 
 
1324
 
1325
  // set timeouts and change hostname and port
1326
  svr.set_read_timeout(sparams.read_timeout);
@@ -1348,6 +1416,9 @@ int main(int argc, char **argv)
1348
  return 1;
1349
  }
1350
 
 
 
 
1351
  llama_backend_free();
1352
 
1353
  return 0;
 
1
  #include "common.h"
2
  #include "llama.h"
3
  #include "build-info.h"
4
+ #include "grammar-parser.h"
5
 
6
  #ifndef NDEBUG
7
  // crash the server in debug mode, otherwise send an http 500 error
 
196
  llama_context *ctx = nullptr;
197
  gpt_params params;
198
 
199
+ llama_grammar *grammar = nullptr;
200
+
201
  bool truncated = false;
202
  bool stopped_eos = false;
203
  bool stopped_word = false;
 
229
  void rewind()
230
  {
231
  params.antiprompt.clear();
232
+ params.grammar.clear();
233
  num_prompt_tokens = 0;
234
  num_tokens_predicted = 0;
235
  generated_text = "";
 
241
  stopped_limit = false;
242
  stopping_word = "";
243
  multibyte_pending = 0;
244
+ grammar = nullptr;
245
 
246
  n_remain = 0;
247
  n_past = 0;
 
262
  return true;
263
  }
264
 
265
+ bool loadGrammar()
266
+ {
267
+ if (!params.grammar.empty()) {
268
+ grammar_parser::parse_state parsed_grammar;
269
+
270
+ parsed_grammar = grammar_parser::parse(params.grammar.c_str());
271
+ // will be empty (default) if there are parse errors
272
+ if (parsed_grammar.rules.empty()) {
273
+ LOG_ERROR("grammar parse error", {{"grammar", params.grammar}});
274
+ return false;
275
+ }
276
+ grammar_parser::print_grammar(stderr, parsed_grammar);
277
+
278
+ {
279
+ auto it = params.logit_bias.find(llama_token_eos());
280
+ if (it != params.logit_bias.end() && it->second == -INFINITY) {
281
+ LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
282
+ }
283
+ }
284
+
285
+ std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
286
+ grammar = llama_grammar_init(
287
+ grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
288
+ }
289
+ return true;
290
+ }
291
+
292
  void loadPrompt()
293
  {
294
  params.prompt.insert(0, 1, ' '); // always add a first space
 
452
  logits[llama_token_nl()] = nl_logit;
453
  }
454
 
455
+ if (grammar != nullptr) {
456
+ llama_sample_grammar(ctx, &candidates_p, grammar);
457
+ }
458
+
459
  if (temp <= 0)
460
  {
461
  // Greedy sampling
 
493
  }
494
  }
495
 
496
+ if (grammar != nullptr) {
497
+ llama_grammar_accept_token(ctx, grammar, result.tok);
498
+ }
499
+
500
  for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
501
  {
502
  result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
503
  }
504
+
505
  last_n_tokens.erase(last_n_tokens.begin());
506
  last_n_tokens.push_back(result.tok);
507
  num_tokens_predicted++;
 
672
  fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
673
  fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
674
  fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
675
+ fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
676
+ fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
677
+ fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
678
  #endif
679
  fprintf(stdout, " -m FNAME, --model FNAME\n");
680
  fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
 
871
  }
872
  }
873
  #else
874
+ LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
875
  #endif // GGML_USE_CUBLAS
876
  }
877
  else if (arg == "--low-vram" || arg == "-lv")
 
879
  #ifdef GGML_USE_CUBLAS
880
  params.low_vram = true;
881
  #else
882
+ LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
883
+ #endif // GGML_USE_CUBLAS
884
+ }
885
+ else if (arg == "--mul-mat-q" || arg == "-mmq")
886
+ {
887
+ #ifdef GGML_USE_CUBLAS
888
+ params.mul_mat_q = true;
889
+ #else
890
+ LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
891
  #endif // GGML_USE_CUBLAS
892
  }
893
  else if (arg == "--main-gpu" || arg == "-mg")
 
988
  {"stream", llama.stream},
989
  {"logit_bias", llama.params.logit_bias},
990
  {"n_probs", llama.params.n_probs},
991
+ {"grammar", llama.params.grammar},
992
  };
993
  }
994
 
 
1090
  llama.params.n_keep = body.value("n_keep", default_params.n_keep);
1091
  llama.params.seed = body.value("seed", default_params.seed);
1092
  llama.params.prompt = body.value("prompt", default_params.prompt);
1093
+ llama.params.grammar = body.value("grammar", default_params.grammar);
1094
  llama.params.n_probs = body.value("n_probs", default_params.n_probs);
1095
 
1096
  llama.params.logit_bias.clear();
 
1222
 
1223
  parse_options_completion(json::parse(req.body), llama);
1224
 
1225
+ if (!llama.loadGrammar())
1226
+ {
1227
+ res.status = 400;
1228
+ return;
1229
+ }
1230
+
1231
  llama.loadPrompt();
1232
  llama.beginCompletion();
1233
 
 
1323
  sink.done();
1324
  return true;
1325
  };
1326
+ const auto on_complete = [&](bool) {
1327
+ llama.mutex.unlock();
1328
+ };
1329
+ lock.release();
1330
+ res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
1331
  } });
1332
 
1333
  svr.Get("/model.json", [&llama](const Request &, Response &res)
 
1383
 
1384
  svr.set_error_handler([](const Request &, Response &res)
1385
  {
1386
+ if (res.status == 400) {
1387
+ res.set_content("Invalid request", "text/plain");
1388
+ } else {
1389
+ res.set_content("File Not Found", "text/plain");
1390
+ res.status = 404;
1391
+ } });
1392
 
1393
  // set timeouts and change hostname and port
1394
  svr.set_read_timeout(sparams.read_timeout);
 
1416
  return 1;
1417
  }
1418
 
1419
+ if (llama.grammar != nullptr) {
1420
+ llama_grammar_free(llama.grammar);
1421
+ }
1422
  llama_backend_free();
1423
 
1424
  return 0;
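The headline server change is grammar-constrained sampling: a GBNF grammar supplied with the request is parsed by loadGrammar(), applied to the candidate tokens with llama_sample_grammar(), and advanced with llama_grammar_accept_token() after every pick; a grammar that fails to parse is now answered with HTTP 400 via the extended error handler. Below is an illustrative request body; the field names follow parse_options_completion() above, while the endpoint path, the toy grammar, and the nlohmann include path are assumptions rather than part of this diff.

// grammar_request.cpp - build a completion request that uses the new "grammar" field
#include <nlohmann/json.hpp> // the server vendors an equivalent json.hpp
#include <iostream>

int main() {
    nlohmann::json body = {
        {"prompt",  "Answer with yes or no: is water wet?"},
        {"grammar", "root ::= \"yes\" | \"no\""}, // GBNF: restrict output to two words
        {"seed",    42}
    };
    // POST body.dump() to the server's completion endpoint; if the grammar does
    // not parse, loadGrammar() fails and the request comes back with status 400.
    std::cout << body.dump(2) << std::endl;
    return 0;
}

The same revision also stops a streaming response from pinning the request mutex: the lock is released before set_chunked_content_provider() and the new on_complete callback unlocks it once the stream finishes. The -mmq/--mul-mat-q switch introduced here is mirrored below in expose.h and ggml-cuda.h.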
examples/simple/simple.cpp CHANGED
@@ -123,7 +123,7 @@ int main(int argc, char ** argv)
123
  // Evaluate the tokens :
124
  //---------------------------------
125
 
126
- if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
127
  {
128
  fprintf( stderr, "%s : failed to eval\n" , __func__ );
129
  return 1;
 
123
  // Evaluate the tokens :
124
  //---------------------------------
125
 
126
+ if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
127
  {
128
  fprintf( stderr, "%s : failed to eval\n" , __func__ );
129
  return 1;
expose.h CHANGED
@@ -30,6 +30,7 @@ struct load_model_inputs
30
  const int batch_size;
31
  const bool f16_kv;
32
  const bool low_vram;
 
33
  const char * executable_path;
34
  const char * model_filename;
35
  const char * lora_filename;
@@ -74,7 +75,7 @@ struct generation_inputs
74
  struct generation_outputs
75
  {
76
  int status = -1;
77
- char text[16384]; //16kb should be enough for any response
78
  };
79
 
80
  extern std::string executable_path;
 
30
  const int batch_size;
31
  const bool f16_kv;
32
  const bool low_vram;
33
+ const bool use_mmq;
34
  const char * executable_path;
35
  const char * model_filename;
36
  const char * lora_filename;
 
75
  struct generation_outputs
76
  {
77
  int status = -1;
78
+ char text[24576]; //24kb should be enough for any response
79
  };
80
 
81
  extern std::string executable_path;
ggml-alloc.c ADDED
@@ -0,0 +1,541 @@
1
+ #include "ggml-alloc.h"
2
+ #include "ggml.h"
3
+ #include <assert.h>
4
+ #include <stdarg.h>
5
+ #include <stdio.h>
6
+ #include <stdlib.h>
7
+ #include <string.h>
8
+
9
+ #define UNUSED(x) (void)(x)
10
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
11
+
12
+ //#define GGML_ALLOCATOR_DEBUG
13
+
14
+ //#define AT_PRINTF printf
15
+ #define AT_PRINTF(...) ((void)0)
16
+
17
+ struct hash_node {
18
+ struct ggml_tensor * t;
19
+ int n_children;
20
+ int n_views;
21
+ };
22
+
23
+ static size_t hash(void * p) {
24
+ return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
25
+ }
26
+
27
+ static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
28
+ size_t h = hash(t);
29
+
30
+ // linear probing
31
+ size_t i = h;
32
+ while (hash_table[i].t != NULL) {
33
+ if (hash_table[i].t == t) {
34
+ return &hash_table[i];
35
+ }
36
+ i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
37
+ if (i == h) {
38
+ // hash table is full
39
+ GGML_ASSERT(false);
40
+ }
41
+ }
42
+
43
+ hash_table[i].t = t;
44
+ return &hash_table[i];
45
+ }
46
+
47
+ // TODO: GGML_PAD ?
48
+ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
49
+ assert(alignment && !(alignment & (alignment - 1))); // power of 2
50
+ size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
51
+ return offset + align;
52
+ }
53
+
54
+ struct free_block {
55
+ void * addr;
56
+ size_t size;
57
+ };
58
+
59
+ #define MAX_FREE_BLOCKS 256
60
+
61
+ struct ggml_allocr {
62
+ void * data;
63
+ size_t size;
64
+ size_t alignment;
65
+ int n_free_blocks;
66
+ struct free_block free_blocks[MAX_FREE_BLOCKS];
67
+ struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
68
+ size_t max_size;
69
+ bool measure;
70
+
71
+ #ifdef GGML_ALLOCATOR_DEBUG
72
+ struct ggml_tensor * allocated_tensors[1024];
73
+ #endif
74
+ };
75
+
76
+ #ifdef GGML_ALLOCATOR_DEBUG
77
+ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
78
+ for (int i = 0; i < 1024; i++) {
79
+ if (alloc->allocated_tensors[i] == NULL) {
80
+ alloc->allocated_tensors[i] = tensor;
81
+ return;
82
+ }
83
+ }
84
+ GGML_ASSERT(!"out of allocated_tensors");
85
+ }
86
+ static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
87
+ for (int i = 0; i < 1024; i++) {
88
+ if (alloc->allocated_tensors[i] == tensor ||
89
+ (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
90
+ alloc->allocated_tensors[i] = NULL;
91
+ return;
92
+ }
93
+ }
94
+ printf("tried to free tensor %s not found\n", tensor->name);
95
+ GGML_ASSERT(!"tensor not found");
96
+ }
97
+ #endif
98
+
99
+
100
+ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
101
+ return ggml_nbytes(tensor);
102
+
103
+ UNUSED(alloc);
104
+ }
105
+
106
+ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
107
+ size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
108
+ size = aligned_offset(NULL, size, alloc->alignment);
109
+
110
+ AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
111
+
112
+ size_t max_avail = 0;
113
+
114
+ // find the best fitting free block
115
+ int best_fit_block = -1;
116
+ size_t best_fit_size = SIZE_MAX;
117
+ for (int i = 0; i < alloc->n_free_blocks; i++) {
118
+ struct free_block * block = &alloc->free_blocks[i];
119
+ max_avail = MAX(max_avail, block->size);
120
+ if (block->size >= size && block->size <= best_fit_size) {
121
+ best_fit_block = i;
122
+ best_fit_size = block->size;
123
+ }
124
+ }
125
+
126
+ AT_PRINTF("block %d\n", best_fit_block);
127
+
128
+ if (best_fit_block == -1) {
129
+ fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
130
+ __func__, size, max_avail);
131
+ GGML_ASSERT(!"not enough space in the buffer");
132
+ return;
133
+ }
134
+ struct free_block * block = &alloc->free_blocks[best_fit_block];
135
+ void * addr = block->addr;
136
+ block->addr = (char*)block->addr + size;
137
+ block->size -= size;
138
+ if (block->size == 0) {
139
+ // remove block if empty
140
+ alloc->n_free_blocks--;
141
+ for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
142
+ alloc->free_blocks[j] = alloc->free_blocks[j+1];
143
+ }
144
+ }
145
+
146
+ tensor->data = addr;
147
+
148
+ #ifdef GGML_ALLOCATOR_DEBUG
149
+ add_allocated_tensor(alloc, tensor);
150
+ size_t cur_max = (char*)addr - (char*)alloc->data + size;
151
+ if (cur_max > alloc->max_size) {
152
+ printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
153
+ for (int i = 0; i < 1024; i++) {
154
+ if (alloc->allocated_tensors[i]) {
155
+ printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
156
+ }
157
+ }
158
+ printf("\n");
159
+ }
160
+ #endif
161
+
162
+ alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
163
+ }
164
+
165
+ // this is a very naive implementation, but for our case the number of free blocks should be very small
166
+ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
167
+ void * ptr = tensor->data;
168
+
169
+ if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
170
+ // the tensor was not allocated in this buffer
171
+ // this can happen because the graph allocator will try to free weights and other tensors from different buffers
172
+ // the easiest way to deal with this is just to ignore it
173
+ return;
174
+ }
175
+
176
+ size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
177
+ size = aligned_offset(NULL, size, alloc->alignment);
178
+ AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
179
+
180
+ #ifdef GGML_ALLOCATOR_DEBUG
181
+ remove_allocated_tensor(alloc, tensor);
182
+ #endif
183
+
184
+ // see if we can merge with an existing block
185
+ for (int i = 0; i < alloc->n_free_blocks; i++) {
186
+ struct free_block * block = &alloc->free_blocks[i];
187
+ // check if ptr is at the end of the block
188
+ if ((char*)block->addr + block->size == ptr) {
189
+ block->size += size;
190
+ // check if we can merge with the next block
191
+ if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
192
+ block->size += alloc->free_blocks[i+1].size;
193
+ alloc->n_free_blocks--;
194
+ for (int j = i+1; j < alloc->n_free_blocks; j++) {
195
+ alloc->free_blocks[j] = alloc->free_blocks[j+1];
196
+ }
197
+ }
198
+ return;
199
+ }
200
+ // check if ptr is at the beginning of the block
201
+ if ((char*)ptr + size == block->addr) {
202
+ block->addr = ptr;
203
+ block->size += size;
204
+ // check if we can merge with the previous block
205
+ if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
206
+ alloc->free_blocks[i-1].size += block->size;
207
+ alloc->n_free_blocks--;
208
+ for (int j = i; j < alloc->n_free_blocks; j++) {
209
+ alloc->free_blocks[j] = alloc->free_blocks[j+1];
210
+ }
211
+ }
212
+ return;
213
+ }
214
+ }
215
+ // otherwise, add a new block
216
+ GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
217
+ // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
218
+ int insert_pos = 0;
219
+ while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
220
+ insert_pos++;
221
+ }
222
+ // shift all blocks from insert_pos onward to make room for the new block
223
+ for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
224
+ alloc->free_blocks[i] = alloc->free_blocks[i-1];
225
+ }
226
+ // insert the new block
227
+ alloc->free_blocks[insert_pos].addr = ptr;
228
+ alloc->free_blocks[insert_pos].size = size;
229
+ alloc->n_free_blocks++;
230
+ }
231
+
232
+ void ggml_allocr_reset(struct ggml_allocr * alloc) {
233
+ alloc->n_free_blocks = 1;
234
+ size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
235
+ alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
236
+ alloc->free_blocks[0].size = alloc->size - align_offset;
237
+ }
238
+
239
+ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
240
+ struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
241
+
242
+ *alloc = (struct ggml_allocr){
243
+ /*.data = */ data,
244
+ /*.size = */ size,
245
+ /*.alignment = */ alignment,
246
+ /*.n_free_blocks = */ 0,
247
+ /*.free_blocks = */ {{0}},
248
+ /*.hash_table = */ {{0}},
249
+ /*.max_size = */ 0,
250
+ /*.measure = */ false,
251
+ #ifdef GGML_ALLOCATOR_DEBUG
252
+ /*.allocated_tensors = */ = {0},
253
+ #endif
254
+ };
255
+
256
+ ggml_allocr_reset(alloc);
257
+
258
+ return alloc;
259
+ }
260
+
261
+ // address and size of the buffer when measuring
262
+ // it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
263
+ static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
264
+ static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
265
+
266
+ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
267
+ struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
268
+
269
+ *alloc = (struct ggml_allocr){
270
+ /*.data = */ MEASURE_BASE_ADDR,
271
+ /*.size = */ MEASURE_MAX_SIZE,
272
+ /*.alignment = */ alignment,
273
+ /*.n_free_blocks = */ 0,
274
+ /*.free_blocks = */ {{0}},
275
+ /*.hash_table = */ {{0}},
276
+ /*.max_size = */ 0,
277
+ /*.measure = */ true,
278
+ #ifdef GGML_ALLOCATOR_DEBUG
279
+ /*.allocated_tensors = */ = {0},
280
+ #endif
281
+ };
282
+
283
+ ggml_allocr_reset(alloc);
284
+
285
+ return alloc;
286
+ }
287
+
288
+ void ggml_allocr_free(struct ggml_allocr * alloc) {
289
+ free(alloc);
290
+ }
291
+
292
+ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
293
+ return alloc->measure;
294
+ }
295
+
296
+ //////////// compute graph allocator
297
+
298
+ static bool ggml_is_view(struct ggml_tensor * t) {
299
+ return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
300
+ t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
301
+ }
302
+
303
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
304
+ if (a->type != b->type) {
305
+ return false;
306
+ }
307
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
308
+ if (a->ne[i] != b->ne[i]) {
309
+ return false;
310
+ }
311
+ if (a->nb[i] != b->nb[i]) {
312
+ return false;
313
+ }
314
+ }
315
+ return true;
316
+ }
317
+
318
+ static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
319
+ switch (t->op) {
320
+ case GGML_OP_PERMUTE:
321
+ case GGML_OP_RESHAPE:
322
+ case GGML_OP_TRANSPOSE:
323
+ case GGML_OP_VIEW:
324
+ return t->src[0];
325
+ case GGML_OP_CPY:
326
+ return t->src[1];
327
+ default:
328
+ return NULL;
329
+ }
330
+ }
331
+
332
+ static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
333
+ struct ggml_tensor * parent = t;
334
+ do {
335
+ parent = get_view_parent(parent);
336
+ } while (ggml_is_view(parent));
337
+ return parent;
338
+ }
339
+
340
+ static bool ggml_op_can_inplace(enum ggml_op op) {
341
+ switch (op) {
342
+ case GGML_OP_SCALE:
343
+ case GGML_OP_DIAG_MASK_ZERO:
344
+ case GGML_OP_DIAG_MASK_INF:
345
+ case GGML_OP_ADD:
346
+ case GGML_OP_ADD1:
347
+ case GGML_OP_ACC:
348
+ case GGML_OP_SUB:
349
+ case GGML_OP_MUL:
350
+ case GGML_OP_DIV:
351
+ case GGML_OP_SQR:
352
+ case GGML_OP_SQRT:
353
+ case GGML_OP_LOG:
354
+ case GGML_OP_UNARY:
355
+ case GGML_OP_ROPE:
356
+ case GGML_OP_RMS_NORM:
357
+ case GGML_OP_SET:
358
+ case GGML_OP_SOFT_MAX:
359
+ case GGML_OP_CONT:
360
+ return true;
361
+
362
+ default:
363
+ return false;
364
+ }
365
+ }
366
+
367
+ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
368
+ struct hash_node * ht = alloc->hash_table;
369
+ if (node->data == NULL) {
370
+ if (ggml_is_view(node)) {
371
+ size_t offset;
372
+ switch(node->op) {
373
+ case GGML_OP_VIEW:
374
+ memcpy(&offset, node->op_params, sizeof(size_t));
375
+ node->data = (char *) node->src[0]->data + offset;
376
+ break;
377
+ case GGML_OP_PERMUTE:
378
+ case GGML_OP_RESHAPE:
379
+ case GGML_OP_TRANSPOSE:
380
+ node->data = node->src[0]->data;
381
+ break;
382
+ case GGML_OP_CPY:
383
+ node->data = node->src[1]->data;
384
+ break;
385
+ default:
386
+ GGML_ASSERT(!"unknown view op");
387
+ break;
388
+ }
389
+ } else {
390
+ // see if we can reuse a parent's buffer (inplace)
391
+ if (ggml_op_can_inplace(node->op)) {
392
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
393
+ struct ggml_tensor * parent = node->src[i];
394
+ if (parent == NULL) {
395
+ break;
396
+ }
397
+ struct hash_node * p_hn = hash_get(ht, parent);
398
+ if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
399
+ if (ggml_is_view(parent)) {
400
+ struct ggml_tensor * view_src = get_view_source(parent);
401
+ struct hash_node * view_src_hn = hash_get(ht, view_src);
402
+ if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
403
+ // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
404
+ // the parent's data that it will need later (same layout requirement). the problem is that then
405
+ // we cannot free the tensor because the original address of the allocation is lost.
406
+ // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
407
+ // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
408
+ AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
409
+ node->data = parent->data;
410
+ return;
411
+ }
412
+ }
413
+ else {
414
+ AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
415
+ node->data = parent->data;
416
+ }
417
+ return;
418
+ }
419
+ }
420
+ }
421
+ ggml_allocr_alloc(alloc, node);
422
+ }
423
+ }
424
+ }
425
+
426
+ static size_t ggml_allocator_alloc_graph_tensors_n(
427
+ struct ggml_allocr * alloc,
428
+ struct ggml_cgraph ** graphs, int n_graphs,
429
+ struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
430
+
431
+ // reset hash table
432
+ struct hash_node * ht = alloc->hash_table;
433
+ memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
434
+
435
+ // count number of children and views
436
+ for (int g = 0; g < n_graphs; g++) {
437
+ struct ggml_cgraph * gf = graphs[g];
438
+ for (int i = 0; i < gf->n_nodes; i++) {
439
+ struct ggml_tensor * node = gf->nodes[i];
440
+
441
+ if (ggml_is_view(node)) {
442
+ struct ggml_tensor * view_src = get_view_source(node);
443
+ hash_get(ht, view_src)->n_views += 1;
444
+ }
445
+
446
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
447
+ struct ggml_tensor * parent = node->src[j];
448
+ if (parent == NULL) {
449
+ break;
450
+ }
451
+ hash_get(ht, parent)->n_children += 1;
452
+ }
453
+ }
454
+ }
455
+
456
+ // allocate tensors
457
+ for (int g = 0; g < n_graphs; g++) {
458
+ struct ggml_cgraph * gf = graphs[g];
459
+ AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
460
+ // graph inputs are allocated first to ensure that they are not overwritten by each other
461
+ if (inputs != NULL && inputs[g] != NULL) {
462
+ for (int i = 0; inputs[g][i] != NULL; i++) {
463
+ struct ggml_tensor * input = inputs[g][i];
464
+ AT_PRINTF("input: %s\n", input->name);
465
+ allocate_node(alloc, input);
466
+ }
467
+ }
468
+ for (int i = 0; i < gf->n_nodes; i++) {
469
+ struct ggml_tensor * node = gf->nodes[i];
470
+
471
+ // allocate parents (leafs)
472
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
473
+ struct ggml_tensor * parent = node->src[j];
474
+ if (parent == NULL) {
475
+ break;
476
+ }
477
+ allocate_node(alloc, parent);
478
+ }
479
+
480
+ // allocate node
481
+ allocate_node(alloc, node);
482
+
483
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
484
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
485
+ struct ggml_tensor * parent = node->src[j];
486
+ if (parent == NULL) {
487
+ break;
488
+ }
489
+ AT_PRINTF("%s", parent->name);
490
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
491
+ AT_PRINTF(", ");
492
+ }
493
+ }
494
+ AT_PRINTF("\n");
495
+
496
+ // update parents
497
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
498
+ struct ggml_tensor * parent = node->src[j];
499
+ if (parent == NULL) {
500
+ break;
501
+ }
502
+ struct hash_node * p_hn = hash_get(ht, parent);
503
+ p_hn->n_children -= 1;
504
+
505
+ //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
506
+
507
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
508
+ if (ggml_is_view(parent)) {
509
+ struct ggml_tensor * view_src = get_view_source(parent);
510
+ struct hash_node * view_src_hn = hash_get(ht, view_src);
511
+ view_src_hn->n_views -= 1;
512
+ AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
513
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
514
+ ggml_allocator_free_tensor(alloc, view_src);
515
+ }
516
+ }
517
+ else {
518
+ if (parent->data != node->data) {
519
+ ggml_allocator_free_tensor(alloc, parent);
520
+ }
521
+ }
522
+ }
523
+ }
524
+ AT_PRINTF("\n");
525
+ }
526
+ // free graph outputs here that wouldn't be freed otherwise because they have no children
527
+ if (outputs != NULL && outputs[g] != NULL) {
528
+ for (int i = 0; outputs[g][i] != NULL; i++) {
529
+ struct ggml_tensor * output = outputs[g][i];
530
+ AT_PRINTF("output: %s\n", output->name);
531
+ ggml_allocator_free_tensor(alloc, output);
532
+ }
533
+ }
534
+ }
535
+
536
+ return alloc->max_size;
537
+ }
538
+
539
+ size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
540
+ return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
541
+ }
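Two details of the allocator above are easy to miss: aligned_offset() rounds the absolute address (buffer base plus offset) up to the requested power-of-two alignment, and ggml_allocator_free_tensor() keeps the free list sorted by address so neighbouring blocks can be merged back together. A small worked example of the alignment rule, with the function body copied from the code above:

// align_demo.cpp - padding rule used by aligned_offset()
#include <cstdint>
#include <cstdio>

static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
    return offset + align;
}

int main() {
    std::printf("%zu\n", aligned_offset(nullptr, 10, 16)); // pads up to 16
    std::printf("%zu\n", aligned_offset(nullptr, 32, 16)); // already aligned: stays 32
    return 0;
}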
ggml-alloc.h ADDED
@@ -0,0 +1,22 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+
5
+ #ifdef __cplusplus
6
+ extern "C" {
7
+ #endif
8
+
9
+
10
+ GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
11
+ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
12
+
13
+ GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
14
+ GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
15
+ GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
16
+ GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
17
+ GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
18
+
19
+
20
+ #ifdef __cplusplus
21
+ }
22
+ #endif
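Taken together the header describes a two-pass workflow: a measuring allocator computes the peak buffer size a graph needs without owning real memory, then a second allocator backed by a buffer of exactly that size performs the actual placement. The sketch below shows that pattern; the caller is assumed to rebuild the graph for each pass (measured tensors receive fake addresses), and the helper name is illustrative, not part of this commit.

// alloc_usage.cpp - measure-then-allocate pattern enabled by ggml-alloc
#include "ggml.h"
#include "ggml-alloc.h"
#include <cstdlib>

struct ggml_allocr * alloc_for_graph(struct ggml_cgraph * measured_graph,
                                     struct ggml_cgraph * real_graph,
                                     size_t alignment, void ** out_buf) {
    // pass 1: the measure allocator pretends to own a huge buffer at a fixed
    // fake address and only records the peak size the graph would occupy
    struct ggml_allocr * measure = ggml_allocr_new_measure(alignment);
    size_t needed = ggml_allocr_alloc_graph(measure, measured_graph);
    ggml_allocr_free(measure);

    // pass 2: back a fresh allocator with a real buffer of that size
    *out_buf = malloc(needed);
    struct ggml_allocr * alloc = ggml_allocr_new(*out_buf, needed, alignment);
    ggml_allocr_alloc_graph(alloc, real_graph);
    return alloc;
}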
ggml-cuda.cu CHANGED
The diff for this file is too large to render. See raw diff
 
ggml-cuda.h CHANGED
@@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
27
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
28
  void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
29
  void ggml_cuda_set_main_device(int main_device);
 
30
  void ggml_cuda_set_scratch_size(size_t scratch_size);
31
  void ggml_cuda_free_scratch(void);
32
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
27
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
28
  void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
29
  void ggml_cuda_set_main_device(int main_device);
30
+ void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
31
  void ggml_cuda_set_scratch_size(size_t scratch_size);
32
  void ggml_cuda_free_scratch(void);
33
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
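ggml_cuda_set_mul_mat_q() is the backend half of the new use_mmq / --mul-mat-q option added in expose.h and server.cpp. A sketch of how a host could forward the flag is below; the function name and call site are illustrative, only the setter itself comes from this header.

// mmq_flag.cpp - forward the use_mmq field from load_model_inputs to the CUDA backend
#include "expose.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif

void apply_backend_flags(const load_model_inputs & inputs) {
#ifdef GGML_USE_CUBLAS
    // prefer the experimental quantized mul_mat kernels over cuBLAS when requested
    ggml_cuda_set_mul_mat_q(inputs.use_mmq);
#else
    (void)inputs; // the flag has no effect in builds without CUDA
#endif
}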
ggml-metal.m CHANGED
@@ -7,6 +7,11 @@
7
  #import <Metal/Metal.h>
8
  #import <MetalPerformanceShaders/MetalPerformanceShaders.h>
9
 
 
 
 
 
 
10
  #ifdef GGML_METAL_NDEBUG
11
  #define metal_printf(...)
12
  #else
@@ -15,6 +20,8 @@
15
 
16
  #define UNUSED(x) (void)(x)
17
 
 
 
18
  struct ggml_metal_buffer {
19
  const char * name;
20
 
@@ -36,7 +43,7 @@ struct ggml_metal_context {
36
  int n_buffers;
37
  struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
38
 
39
- int concur_list[GGML_MAX_NODES];
40
  int concur_list_len;
41
 
42
  // custom kernels
@@ -370,15 +377,15 @@ void ggml_metal_graph_find_concurrency(
370
  struct ggml_metal_context * ctx,
371
  struct ggml_cgraph * gf) {
372
  int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
373
- int nodes_unused[GGML_MAX_NODES];
374
 
375
- for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;}
376
- for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;}
377
  ctx->concur_list_len = 0;
378
 
379
- int n_left = gf->n_nodes;
380
- int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
381
- int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
382
 
383
  while (n_left > 0) {
384
  // number of nodes at a layer (that can be issued concurrently)
@@ -386,28 +393,40 @@ void ggml_metal_graph_find_concurrency(
386
  for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
387
  if (nodes_unused[i]) {
388
  // if the requirements for gf->nodes[i] are satisfied
389
- int exe_flag=1;
 
390
  // scan all srcs
391
  for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
392
  struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
393
  if (src_cur) {
394
  // if is leaf nodes it's satisfied.
395
- if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
 
 
 
396
 
397
  // otherwise this src should be the output from previous nodes.
398
  int is_found = 0;
 
399
  // scan 2*search_depth back because we inserted barrier.
400
- for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
401
- if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;}
 
 
 
 
 
 
 
 
402
  }
403
- if (is_found == 0) {exe_flag = 0; break;}
404
  }
405
  }
406
  if (exe_flag) {
407
  // check if nodes[i]'s data will be overwritten by a node before nodes[i].
408
  // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
409
  int64_t data_start = (int64_t) gf->nodes[i]->data;
410
- int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
411
  for (int j = n_start; j < i; j++) {
412
  if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
413
  && gf->nodes[j]->op != GGML_OP_VIEW \
@@ -416,9 +435,9 @@ void ggml_metal_graph_find_concurrency(
416
  if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
417
  ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
418
  continue;
419
- } else {
420
- exe_flag = 0;
421
  }
 
 
422
  }
423
  }
424
  }
@@ -435,11 +454,13 @@ void ggml_metal_graph_find_concurrency(
435
  ctx->concur_list[level_pos + concurrency] = -1;
436
  ctx->concur_list_len++;
437
  // jump all sorted nodes at nodes_bak
438
- while (!nodes_unused[n_start]) {n_start++;}
 
 
439
  level_pos += concurrency + 1;
440
  }
441
 
442
- if (ctx->concur_list_len > GGML_MAX_NODES) {
443
  fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
444
  }
445
  }
@@ -453,7 +474,7 @@ void ggml_metal_graph_compute(
453
  // else fallback to serial dispatch
454
  MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
455
 
456
- const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES;
457
 
458
  const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes;
459
  edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
@@ -718,7 +739,8 @@ void ggml_metal_graph_compute(
718
  // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
719
 
720
  GGML_ASSERT(ne00 == ne10);
721
- GGML_ASSERT(ne02 == ne12);
 
722
 
723
  if (ggml_is_contiguous(src0) &&
724
  ggml_is_contiguous(src1) &&
@@ -746,11 +768,11 @@ void ggml_metal_graph_compute(
746
  initWithDevice:ctx->device transposeLeft:false transposeRight:true
747
  resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
748
 
749
- // we need to do ne02 multiplications
750
  // TODO: is there a way to do this in parallel - currently very slow ..
751
  // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
752
- for (int64_t i02 = 0; i02 < ne02; ++i02) {
753
- size_t offs_src0_cur = offs_src0 + i02*nb02;
754
  size_t offs_src1_cur = offs_src1 + i02*nb12;
755
  size_t offs_dst_cur = offs_dst + i02*nb2;
756
 
@@ -772,8 +794,6 @@ void ggml_metal_graph_compute(
772
  switch (src0t) {
773
  case GGML_TYPE_F16:
774
  {
775
- GGML_ASSERT(ne02 == ne12);
776
-
777
  nth0 = 64;
778
  nth1 = 1;
779
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +873,18 @@ void ggml_metal_graph_compute(
853
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
854
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
855
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
856
- [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
857
- [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
858
- [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
859
- [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
860
- [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
861
- [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
862
- [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
863
- [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
864
- [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
865
- [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
 
 
866
 
867
  if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
868
  src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
 
7
  #import <Metal/Metal.h>
8
  #import <MetalPerformanceShaders/MetalPerformanceShaders.h>
9
 
10
+ #undef MIN
11
+ #undef MAX
12
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
13
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
14
+
15
  #ifdef GGML_METAL_NDEBUG
16
  #define metal_printf(...)
17
  #else
 
20
 
21
  #define UNUSED(x) (void)(x)
22
 
23
+ #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
24
+
25
  struct ggml_metal_buffer {
26
  const char * name;
27
 
 
43
  int n_buffers;
44
  struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
45
 
46
+ int concur_list[GGML_MAX_CONCUR];
47
  int concur_list_len;
48
 
49
  // custom kernels
 
377
  struct ggml_metal_context * ctx,
378
  struct ggml_cgraph * gf) {
379
  int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
380
+ int nodes_unused[GGML_MAX_CONCUR];
381
 
382
+ for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
383
+ for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; }
384
  ctx->concur_list_len = 0;
385
 
386
+ int n_left = gf->n_nodes;
387
+ int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
388
+ int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
389
 
390
  while (n_left > 0) {
391
  // number of nodes at a layer (that can be issued concurrently)
 
393
  for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
394
  if (nodes_unused[i]) {
395
  // if the requirements for gf->nodes[i] are satisfied
396
+ int exe_flag = 1;
397
+
398
  // scan all srcs
399
  for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
400
  struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
401
  if (src_cur) {
402
  // if is leaf nodes it's satisfied.
403
+ // TODO: ggml_is_leaf()
404
+ if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
405
+ continue;
406
+ }
407
 
408
  // otherwise this src should be the output from previous nodes.
409
  int is_found = 0;
410
+
411
  // scan 2*search_depth back because we inserted barrier.
412
+ //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
413
+ for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
414
+ if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
415
+ is_found = 1;
416
+ break;
417
+ }
418
+ }
419
+ if (is_found == 0) {
420
+ exe_flag = 0;
421
+ break;
422
  }
 
423
  }
424
  }
425
  if (exe_flag) {
426
  // check if nodes[i]'s data will be overwritten by a node before nodes[i].
427
  // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
428
  int64_t data_start = (int64_t) gf->nodes[i]->data;
429
+ int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
430
  for (int j = n_start; j < i; j++) {
431
  if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
432
  && gf->nodes[j]->op != GGML_OP_VIEW \
 
435
  if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
436
  ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
437
  continue;
 
 
438
  }
439
+
440
+ exe_flag = 0;
441
  }
442
  }
443
  }
 
454
  ctx->concur_list[level_pos + concurrency] = -1;
455
  ctx->concur_list_len++;
456
  // jump all sorted nodes at nodes_bak
457
+ while (!nodes_unused[n_start]) {
458
+ n_start++;
459
+ }
460
  level_pos += concurrency + 1;
461
  }
462
 
463
+ if (ctx->concur_list_len > GGML_MAX_CONCUR) {
464
  fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
465
  }
466
  }
 
474
  // else fallback to serial dispatch
475
  MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
476
 
477
+ const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
478
 
479
  const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes;
480
  edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
 
739
  // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
740
 
741
  GGML_ASSERT(ne00 == ne10);
742
+ // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
743
+ GGML_ASSERT(ne03 == ne13);
744
 
745
  if (ggml_is_contiguous(src0) &&
746
  ggml_is_contiguous(src1) &&
 
768
  initWithDevice:ctx->device transposeLeft:false transposeRight:true
769
  resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
770
 
771
+ // we need to do ne12 multiplications
772
  // TODO: is there a way to do this in parallel - currently very slow ..
773
  // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
774
+ for (int64_t i02 = 0; i02 < ne12; ++i02) {
775
+ size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
776
  size_t offs_src1_cur = offs_src1 + i02*nb12;
777
  size_t offs_dst_cur = offs_dst + i02*nb2;
778
 
 
794
  switch (src0t) {
795
  case GGML_TYPE_F16:
796
  {
 
 
797
  nth0 = 64;
798
  nth1 = 1;
799
  [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
 
873
  [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
874
  [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
875
  [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
876
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
877
+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
878
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
879
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
880
+ [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
881
+ [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
882
+ [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
883
+ [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
884
+ [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
885
+ [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
886
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
887
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
888
 
889
  if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
890
  src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
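The Metal changes drop the hard ne02 == ne12 assertion in favour of broadcasting: the matrix-multiplication path now loops over the ne12 batches of src1 and maps each one to a src0 slice with i02/(ne12/ne02), which is the layout grouped-query attention needs when several query heads share one KV slice. A tiny demonstration of that index mapping:

// broadcast_demo.cpp - batch index mapping used by the updated Metal mul_mat path
#include <cstdio>

int main() {
    const int ne02 = 2, ne12 = 6; // three src1 batches per src0 slice
    for (int i02 = 0; i02 < ne12; ++i02) {
        std::printf("src1 batch %d -> src0 slice %d\n", i02, i02 / (ne12 / ne02));
    }
    // prints slices 0,0,0,1,1,1
    return 0;
}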
ggml-metal.metal CHANGED
@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
509
  device float * dst,
510
  constant int64_t & ne00,
511
  constant int64_t & ne01,
 
512
  constant uint64_t & nb00,
513
  constant uint64_t & nb01,
514
  constant uint64_t & nb02,
515
  constant int64_t & ne10,
516
  constant int64_t & ne11,
 
517
  constant uint64_t & nb10,
518
  constant uint64_t & nb11,
519
  constant uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
529
  const int64_t r1 = tgpig.y;
530
  const int64_t im = tgpig.z;
531
 
532
- device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
533
  device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
534
 
535
  sum[tpitg.x] = 0.0f;
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
552
  }
553
  }
554
 
 
555
  kernel void kernel_alibi_f32(
556
  device const float * src0,
557
  device float * dst,
 
509
  device float * dst,
510
  constant int64_t & ne00,
511
  constant int64_t & ne01,
512
+ constant int64_t & ne02,
513
  constant uint64_t & nb00,
514
  constant uint64_t & nb01,
515
  constant uint64_t & nb02,
516
  constant int64_t & ne10,
517
  constant int64_t & ne11,
518
+ constant int64_t & ne12,
519
  constant uint64_t & nb10,
520
  constant uint64_t & nb11,
521
  constant uint64_t & nb12,
 
531
  const int64_t r1 = tgpig.y;
532
  const int64_t im = tgpig.z;
533
 
534
+ device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
535
  device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
536
 
537
  sum[tpitg.x] = 0.0f;
 
554
  }
555
  }
556
 
557
+
558
  kernel void kernel_alibi_f32(
559
  device const float * src0,
560
  device float * dst,
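
Note on the two Metal hunks above: the host code in ggml-metal.m now issues ne12 matrix multiplications and maps each src1 slice back to a src0 slice with an integer division (offs_src0 + i02/(ne12/ne02)*nb02), and the kernel in ggml-metal.metal does the same with its threadgroup z index (im/(ne12/ne02)*nb02). This is the broadcast needed when src1 has more slices than src0 (grouped-query attention). A minimal, self-contained C sketch of that index mapping — the sizes below are illustrative assumptions, not values from the repo:

    #include <stdio.h>

    int main(void) {
        const int ne02 = 8;   // src0 slices (e.g. KV heads), assumed for illustration
        const int ne12 = 32;  // src1 slices (e.g. Q heads); must be a multiple of ne02
        for (int i12 = 0; i12 < ne12; ++i12) {
            // same formula as the i02/(ne12/ne02) and im/(ne12/ne02) offsets above
            const int i02 = i12 / (ne12 / ne02);
            printf("src1 slice %2d reads src0 slice %d\n", i12, i02);
        }
        return 0;
    }

With these numbers every group of 4 consecutive src1 slices shares one src0 slice, which is exactly the reuse pattern the loop over ne12 in the host code relies on.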
ggml.c CHANGED
@@ -195,8 +195,8 @@ typedef void * thread_ret_t;
195
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
196
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
197
  #else
198
- inline static void* ggml_aligned_malloc(size_t size) {
199
- void* aligned_memory = NULL;
200
  #ifdef GGML_USE_METAL
201
  int result = posix_memalign(&aligned_memory, getpagesize(), size);
202
  #else
@@ -3812,7 +3812,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3812
  "CROSS_ENTROPY_LOSS_BACK",
3813
  };
3814
 
3815
- static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
3816
 
3817
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3818
  "none",
@@ -3884,7 +3884,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3884
  "cross_entropy_loss_back(x,y)",
3885
  };
3886
 
3887
- static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
3888
 
3889
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
3890
 
@@ -4072,8 +4072,8 @@ bool ggml_is_numa(void) {
4072
  ////////////////////////////////////////////////////////////////////////////////
4073
 
4074
  void ggml_print_object(const struct ggml_object * obj) {
4075
- GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
4076
- obj->offs, obj->size, (const void *) obj->next);
4077
  }
4078
 
4079
  void ggml_print_objects(const struct ggml_context * ctx) {
@@ -4111,7 +4111,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
4111
  //
4112
  // is enough, but just in case, adding the second part
4113
 
4114
- return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
4115
  }
4116
 
4117
  size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
@@ -4213,7 +4213,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
4213
  }
4214
 
4215
  size_t ggml_tensor_overhead(void) {
4216
- return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
4217
  }
4218
 
4219
  bool ggml_is_transposed(const struct ggml_tensor * tensor) {
@@ -4254,7 +4254,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
4254
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4255
  }
4256
 
4257
- static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
4258
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4259
 
4260
  return
@@ -4384,7 +4384,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4384
  return NULL;
4385
  }
4386
 
4387
- const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
4388
 
4389
  *ctx = (struct ggml_context) {
4390
  /*.mem_size =*/ mem_size,
@@ -4473,12 +4473,14 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4473
  struct ggml_object * obj = ctx->objects_begin;
4474
 
4475
  while (obj != NULL) {
4476
- struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
 
4477
 
4478
- const size_t size = ggml_nbytes(tensor);
4479
 
4480
- if (max_size < size) {
4481
- max_size = size;
 
4482
  }
4483
 
4484
  obj = obj->next;
@@ -4510,12 +4512,7 @@ static void ggml_scratch_load(struct ggml_context * ctx) {
4510
 
4511
  ////////////////////////////////////////////////////////////////////////////////
4512
 
4513
- static struct ggml_tensor * ggml_new_tensor_impl(
4514
- struct ggml_context * ctx,
4515
- enum ggml_type type,
4516
- int n_dims,
4517
- const int64_t* ne,
4518
- void* data) {
4519
  // always insert objects at the end of the context's memory pool
4520
  struct ggml_object * obj_cur = ctx->objects_end;
4521
 
@@ -4523,77 +4520,81 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4523
  const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
4524
  const size_t cur_end = cur_offs + cur_size;
4525
 
4526
- size_t size_needed = 0;
4527
-
4528
- if (data == NULL && !ctx->no_alloc) {
4529
- size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
4530
- for (int i = 1; i < n_dims; i++) {
4531
- size_needed *= ne[i];
4532
- }
4533
- // align to GGML_MEM_ALIGN
4534
- size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
4535
- }
4536
 
4537
  char * const mem_buffer = ctx->mem_buffer;
4538
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
4539
 
4540
- if (ctx->scratch.data == NULL || data != NULL) {
4541
- size_needed += GGML_TENSOR_SIZE;
 
 
 
 
4542
 
4543
- if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4544
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4545
- __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
4546
- assert(false);
4547
- return NULL;
4548
- }
4549
 
4550
- *obj_new = (struct ggml_object) {
4551
- .offs = cur_end + GGML_OBJECT_SIZE,
4552
- .size = size_needed,
4553
- .next = NULL,
4554
- };
4555
  } else {
4556
- if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
4557
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4558
- __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
4559
- assert(false);
4560
- return NULL;
 
4561
  }
 
4562
 
4563
- if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
4564
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4565
- __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
 
 
4566
  assert(false);
4567
  return NULL;
4568
  }
4569
 
4570
  data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4571
 
4572
- *obj_new = (struct ggml_object) {
4573
- .offs = cur_end + GGML_OBJECT_SIZE,
4574
- .size = GGML_TENSOR_SIZE,
4575
- .next = NULL,
4576
- };
4577
-
4578
- //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
4579
 
4580
- ctx->scratch.offs += size_needed;
4581
  }
4582
 
4583
- if (obj_cur != NULL) {
4584
- obj_cur->next = obj_new;
4585
- } else {
4586
- // this is the first object in this context
4587
- ctx->objects_begin = obj_new;
4588
- }
4589
-
4590
- ctx->objects_end = obj_new;
4591
 
4592
- //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
4593
-
4594
- struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
4595
 
4596
- ggml_assert_aligned(result);
4597
 
4598
  *result = (struct ggml_tensor) {
4599
  /*.type =*/ type,
@@ -4602,7 +4603,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4602
  /*.ne =*/ { 1, 1, 1, 1 },
4603
  /*.nb =*/ { 0, 0, 0, 0 },
4604
  /*.op =*/ GGML_OP_NONE,
4605
- /*.op_params =*/ {0},
4606
  /*.is_param =*/ false,
4607
  /*.grad =*/ NULL,
4608
  /*.src =*/ { NULL },
@@ -4634,6 +4635,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4634
  }
4635
 
4636
  static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
 
4637
  assert(params_size <= GGML_MAX_OP_PARAMS);
4638
  memcpy(tensor->op_params, params, params_size);
4639
  }
@@ -4650,22 +4652,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3
4650
 
4651
  struct ggml_tensor * ggml_new_tensor(
4652
  struct ggml_context * ctx,
4653
- enum ggml_type type,
4654
- int n_dims,
4655
- const int64_t * ne) {
4656
  return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
4657
  }
4658
 
4659
  struct ggml_tensor * ggml_new_tensor_1d(
4660
  struct ggml_context * ctx,
4661
- enum ggml_type type,
4662
  int64_t ne0) {
4663
  return ggml_new_tensor(ctx, type, 1, &ne0);
4664
  }
4665
 
4666
  struct ggml_tensor * ggml_new_tensor_2d(
4667
  struct ggml_context * ctx,
4668
- enum ggml_type type,
4669
  int64_t ne0,
4670
  int64_t ne1) {
4671
  const int64_t ne[2] = { ne0, ne1 };
@@ -4674,7 +4676,7 @@ struct ggml_tensor * ggml_new_tensor_2d(
4674
 
4675
  struct ggml_tensor * ggml_new_tensor_3d(
4676
  struct ggml_context * ctx,
4677
- enum ggml_type type,
4678
  int64_t ne0,
4679
  int64_t ne1,
4680
  int64_t ne2) {
@@ -4984,11 +4986,6 @@ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
4984
  return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
4985
  }
4986
 
4987
- static void ggml_set_unary_op(struct ggml_tensor * tensor, enum ggml_unary_op op) {
4988
- GGML_ASSERT(tensor->op = GGML_OP_UNARY);
4989
- ggml_set_op_params_i32(tensor, 0, (int32_t) op);
4990
- }
4991
-
4992
  const char * ggml_get_name(const struct ggml_tensor * tensor) {
4993
  return tensor->name;
4994
  }
@@ -5027,9 +5024,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
5027
  char * const mem_buffer = ctx->mem_buffer;
5028
 
5029
  while (obj != NULL) {
5030
- struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
5031
- if (strcmp(cur->name, name) == 0) {
5032
- return cur;
 
 
5033
  }
5034
 
5035
  obj = obj->next;
@@ -6243,6 +6242,27 @@ struct ggml_tensor * ggml_reshape_4d(
6243
 
6244
  // ggml_view_1d
6245
6246
  struct ggml_tensor * ggml_view_1d(
6247
  struct ggml_context * ctx,
6248
  struct ggml_tensor * a,
@@ -6255,10 +6275,7 @@ struct ggml_tensor * ggml_view_1d(
6255
  is_node = true;
6256
  }
6257
 
6258
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
6259
- ggml_format_name(result, "%s (view)", a->name);
6260
-
6261
- ggml_set_op_params(result, &offset, sizeof(offset));
6262
 
6263
  result->op = GGML_OP_VIEW;
6264
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6285,10 +6302,7 @@ struct ggml_tensor * ggml_view_2d(
6285
 
6286
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6287
 
6288
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
6289
- ggml_format_name(result, "%s (view)", a->name);
6290
-
6291
- ggml_set_op_params(result, &offset, sizeof(offset));
6292
 
6293
  result->nb[1] = nb1;
6294
  result->nb[2] = result->nb[1]*ne1;
@@ -6321,10 +6335,7 @@ struct ggml_tensor * ggml_view_3d(
6321
 
6322
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6323
 
6324
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
6325
- ggml_format_name(result, "%s (view)", a->name);
6326
-
6327
- ggml_set_op_params(result, &offset, sizeof(offset));
6328
 
6329
  result->nb[1] = nb1;
6330
  result->nb[2] = nb2;
@@ -6359,10 +6370,7 @@ struct ggml_tensor * ggml_view_4d(
6359
 
6360
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6361
 
6362
- struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
6363
- ggml_format_name(result, "%s (view)", a->name);
6364
-
6365
- ggml_set_op_params(result, &offset, sizeof(offset));
6366
 
6367
  result->nb[1] = nb1;
6368
  result->nb[2] = nb2;
@@ -6433,7 +6441,7 @@ struct ggml_tensor * ggml_permute(
6433
  result->src[0] = a;
6434
 
6435
  int32_t params[] = { axis0, axis1, axis2, axis3 };
6436
- ggml_set_op_params(result, &params, sizeof(params));
6437
 
6438
  return result;
6439
  }
@@ -6559,7 +6567,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
6559
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6560
 
6561
  int32_t params[] = { n_past, inplace ? 1 : 0 };
6562
- ggml_set_op_params(result, &params, sizeof(params));
6563
 
6564
  result->op = GGML_OP_DIAG_MASK_INF;
6565
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6599,7 +6607,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
6599
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6600
 
6601
  int32_t params[] = { n_past, inplace ? 1 : 0 };
6602
- ggml_set_op_params(result, &params, sizeof(params));
6603
 
6604
  result->op = GGML_OP_DIAG_MASK_ZERO;
6605
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6715,9 +6723,9 @@ static struct ggml_tensor * ggml_rope_impl(
6715
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6716
 
6717
  int32_t params[6] = { n_past, n_dims, mode, n_ctx };
6718
- memcpy(params + 4, &freq_base, sizeof(float));
6719
  memcpy(params + 5, &freq_scale, sizeof(float));
6720
- ggml_set_op_params(result, &params, sizeof(params));
6721
 
6722
  result->op = GGML_OP_ROPE;
6723
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6746,6 +6754,18 @@ struct ggml_tensor * ggml_rope_inplace(
6746
  return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
6747
  }
6748
6749
  struct ggml_tensor * ggml_rope_custom_inplace(
6750
  struct ggml_context * ctx,
6751
  struct ggml_tensor * a,
@@ -6779,7 +6799,7 @@ struct ggml_tensor * ggml_rope_back(
6779
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
6780
 
6781
  int32_t params[] = { n_past, n_dims, mode, n_ctx };
6782
- ggml_set_op_params(result, &params, sizeof(params));
6783
 
6784
  result->op = GGML_OP_ROPE_BACK;
6785
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6810,7 +6830,7 @@ struct ggml_tensor * ggml_alibi(
6810
 
6811
  int32_t op_params[3] = { n_past, n_head };
6812
  memcpy(op_params + 2, &bias_max, sizeof(float));
6813
- ggml_set_op_params(result, &op_params, sizeof(op_params));
6814
 
6815
  result->op = GGML_OP_ALIBI;
6816
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6837,7 +6857,7 @@ struct ggml_tensor * ggml_clamp(
6837
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6838
 
6839
  float params[] = { min, max };
6840
- ggml_set_op_params(result, &params, sizeof(params));
6841
 
6842
  result->op = GGML_OP_CLAMP;
6843
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6872,10 +6892,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
6872
  ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
6873
  a->ne[2], 1, 1,
6874
  };
6875
- struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6876
 
6877
  int32_t params[] = { s0, p0, d0 };
6878
- ggml_set_op_params(result, &params, sizeof(params));
6879
 
6880
  result->op = GGML_OP_CONV_1D;
6881
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6887,10 +6907,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
6887
 
6888
  // ggml_conv_2d
6889
 
6890
- struct ggml_tensor* ggml_conv_2d(
6891
- struct ggml_context* ctx,
6892
- struct ggml_tensor * a,
6893
- struct ggml_tensor * b,
6894
  int s0,
6895
  int s1,
6896
  int p0,
@@ -6911,10 +6931,10 @@ struct ggml_tensor* ggml_conv_2d(
6911
  ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
6912
  a->ne[3], b->ne[3],
6913
  };
6914
- struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
6915
 
6916
  int32_t params[] = { s0, s1, p0, p1, d0, d1 };
6917
- ggml_set_op_params(result, &params, sizeof(params));
6918
 
6919
  result->op = GGML_OP_CONV_2D;
6920
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6927,7 +6947,7 @@ struct ggml_tensor* ggml_conv_2d(
6927
 
6928
  // ggml_conv_1d_ph
6929
 
6930
- struct ggml_tensor* ggml_conv_1d_ph(
6931
  struct ggml_context * ctx,
6932
  struct ggml_tensor * a,
6933
  struct ggml_tensor * b,
@@ -6945,7 +6965,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
6945
 
6946
  // ggml_pool_1d
6947
 
6948
- struct ggml_tensor* ggml_pool_1d(
6949
  struct ggml_context * ctx,
6950
  struct ggml_tensor * a,
6951
  enum ggml_op_pool op,
@@ -6964,10 +6984,10 @@ struct ggml_tensor* ggml_pool_1d(
6964
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
6965
  a->ne[1],
6966
  };
6967
- struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6968
 
6969
  int32_t params[] = { op, k0, s0, p0 };
6970
- ggml_set_op_params(result, &params, sizeof(params));
6971
 
6972
  result->op = GGML_OP_POOL_1D;
6973
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6978,7 +6998,7 @@ struct ggml_tensor* ggml_pool_1d(
6978
 
6979
  // ggml_pool_2d
6980
 
6981
- struct ggml_tensor* ggml_pool_2d(
6982
  struct ggml_context * ctx,
6983
  struct ggml_tensor * a,
6984
  enum ggml_op_pool op,
@@ -7001,10 +7021,10 @@ struct ggml_tensor* ggml_pool_2d(
7001
  ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
7002
  a->ne[2],
7003
  };
7004
- struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7005
 
7006
  int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
7007
- ggml_set_op_params(result, &params, sizeof(params));
7008
 
7009
  result->op = GGML_OP_POOL_2D;
7010
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7172,7 +7192,7 @@ struct ggml_tensor * ggml_win_part(
7172
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7173
 
7174
  int32_t params[] = { npx, npy, w };
7175
- ggml_set_op_params(result, &params, sizeof(params));
7176
 
7177
  result->op = GGML_OP_WIN_PART;
7178
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7202,7 +7222,7 @@ struct ggml_tensor * ggml_win_unpart(
7202
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7203
 
7204
  int32_t params[] = { w };
7205
- ggml_set_op_params(result, &params, sizeof(params));
7206
 
7207
  result->op = GGML_OP_WIN_UNPART;
7208
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7226,7 +7246,7 @@ static struct ggml_tensor * ggml_unary_impl(
7226
 
7227
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7228
 
7229
- ggml_set_unary_op(result, op);
7230
 
7231
  result->op = GGML_OP_UNARY;
7232
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7331,7 +7351,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
7331
  return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
7332
  }
7333
 
7334
- // ggml_map_custom1
7335
 
7336
  static struct ggml_tensor * ggml_map_custom1_impl_f32(
7337
  struct ggml_context * ctx,
@@ -7348,7 +7368,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32(
7348
 
7349
  ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7350
 
7351
- result->op = GGML_OP_MAP_CUSTOM1;
7352
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7353
  result->src[0] = a;
7354
 
@@ -7369,7 +7389,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
7369
  return ggml_map_custom1_impl_f32(ctx, a, fun, true);
7370
  }
7371
 
7372
- // ggml_map_custom2
7373
 
7374
  static struct ggml_tensor * ggml_map_custom2_impl_f32(
7375
  struct ggml_context * ctx,
@@ -7387,7 +7407,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32(
7387
 
7388
  ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7389
 
7390
- result->op = GGML_OP_MAP_CUSTOM2;
7391
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7392
  result->src[0] = a;
7393
  result->src[1] = b;
@@ -7411,7 +7431,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
7411
  return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
7412
  }
7413
 
7414
- // ggml_map_custom3
7415
 
7416
  static struct ggml_tensor * ggml_map_custom3_impl_f32(
7417
  struct ggml_context * ctx,
@@ -7430,7 +7450,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32(
7430
 
7431
  ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7432
 
7433
- result->op = GGML_OP_MAP_CUSTOM3;
7434
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7435
  result->src[0] = a;
7436
  result->src[1] = b;
@@ -7457,6 +7477,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
7457
  return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
7458
  }
7459
7460
  // ggml_cross_entropy_loss
7461
 
7462
  struct ggml_tensor * ggml_cross_entropy_loss(
@@ -9265,8 +9469,8 @@ static void ggml_compute_forward_sum_rows_f32(
9265
  for (int64_t i3 = 0; i3 < ne03; i3++) {
9266
  for (int64_t i2 = 0; i2 < ne02; i2++) {
9267
  for (int64_t i1 = 0; i1 < ne01; i1++) {
9268
- float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
9269
- float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
9270
  float row_sum = 0;
9271
  ggml_vec_sum_f32(ne00, &row_sum, src_row);
9272
  dst_row[0] = row_sum;
@@ -10523,71 +10727,95 @@ static void ggml_compute_forward_mul_mat(
10523
  return;
10524
  }
10525
 
10526
- // parallelize by src0 rows
10527
- const int64_t dr = (ne01 + nth - 1)/nth;
10528
 
10529
- const int64_t ir10 = dr*ith;
10530
- const int64_t ir11 = MIN(ir10 + dr, ne01);
10531
 
10532
- // src1 rows
10533
- const int64_t nr1 = ne11*ne12*ne13;
10534
 
10535
- const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10536
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
10537
 
10538
- for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
10539
- const int64_t i13 = (ir1/(ne12*ne11));
10540
- const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
10541
- const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
10542
-
10543
- const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
10544
- const int64_t i03 = (ir0/(ne02));
10545
- // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
10546
- // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
10547
- // GG: this is likely the correct way to broadcast, though need some more thought
10548
- // therefore leaving the comments to remind us for now
10549
- const int64_t i02 = (i12 / (ne12 / ne02));
10550
- // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
10551
- // const int64_t i02 = (ir0 - i03*ne02);
10552
-
10553
- const int64_t i1 = i11;
10554
- const int64_t i2 = i12;
10555
- const int64_t i3 = i13;
10556
-
10557
- const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
10558
-
10559
- // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
10560
- // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
10561
- // the original src1 data pointer, so we should index using the indices directly
10562
- // TODO: this is a bit of a hack, we should probably have a better way to handle this
10563
- const char * src1_col = (const char *) wdata +
10564
- (src1_cont || src1->type != vec_dot_type
10565
- ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
10566
- : (i11*nb11 + i12*nb12 + i13*nb13));
10567
-
10568
- float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
10569
-
10570
- for (int64_t ir = ir10; ir < ir11; ++ir) {
10571
- vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
10572
- }
10573
  }
10574
 
10575
- //int64_t t1 = ggml_time_us();
10576
- //static int64_t acc = 0;
10577
- //acc += t1 - t0;
10578
- //if (t1 - t0 > 10) {
10579
- // printf("\n");
10580
- // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
10581
- // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
10582
- // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
10583
 
10584
- // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
10585
- //}
10586
- }
10587
 
 
 
 
10588
 
10589
- // ggml_compute_forward_out_prod
 
10590
10591
 
10592
  static void ggml_compute_forward_out_prod_f32(
10593
  const struct ggml_compute_params * params,
@@ -12871,7 +13099,7 @@ static void ggml_compute_forward_pool_1d(
12871
  const struct ggml_tensor * src0,
12872
  struct ggml_tensor * dst) {
12873
 
12874
- const int32_t* opts = (const int32_t*)dst->op_params;
12875
  enum ggml_op_pool op = opts[0];
12876
  const int k0 = opts[1];
12877
  const int s0 = opts[2];
@@ -14204,24 +14432,6 @@ static void ggml_compute_forward_map_custom1_f32(
14204
  fun(dst, a);
14205
  }
14206
 
14207
-
14208
- static void ggml_compute_forward_map_custom1(
14209
- const struct ggml_compute_params * params,
14210
- const struct ggml_tensor * a,
14211
- struct ggml_tensor * dst,
14212
- const ggml_custom1_op_f32_t fun) {
14213
- switch (a->type) {
14214
- case GGML_TYPE_F32:
14215
- {
14216
- ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
14217
- } break;
14218
- default:
14219
- {
14220
- GGML_ASSERT(false);
14221
- } break;
14222
- }
14223
- }
14224
-
14225
  // ggml_compute_forward_map_custom2
14226
 
14227
  static void ggml_compute_forward_map_custom2_f32(
@@ -14240,24 +14450,6 @@ static void ggml_compute_forward_map_custom2_f32(
14240
  }
14241
 
14242
 
14243
- static void ggml_compute_forward_map_custom2(
14244
- const struct ggml_compute_params * params,
14245
- const struct ggml_tensor * a,
14246
- const struct ggml_tensor * b,
14247
- struct ggml_tensor * dst,
14248
- const ggml_custom2_op_f32_t fun) {
14249
- switch (a->type) {
14250
- case GGML_TYPE_F32:
14251
- {
14252
- ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
14253
- } break;
14254
- default:
14255
- {
14256
- GGML_ASSERT(false);
14257
- } break;
14258
- }
14259
- }
14260
-
14261
  // ggml_compute_forward_map_custom3
14262
 
14263
  static void ggml_compute_forward_map_custom3_f32(
@@ -14276,24 +14468,52 @@ static void ggml_compute_forward_map_custom3_f32(
14276
  fun(dst, a, b, c);
14277
  }
14278
14279
 
14280
  static void ggml_compute_forward_map_custom3(
14281
  const struct ggml_compute_params * params,
14282
  const struct ggml_tensor * a,
14283
  const struct ggml_tensor * b,
14284
  const struct ggml_tensor * c,
14285
- struct ggml_tensor * dst,
14286
- const ggml_custom3_op_f32_t fun) {
14287
- switch (a->type) {
14288
- case GGML_TYPE_F32:
14289
- {
14290
- ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
14291
- } break;
14292
- default:
14293
- {
14294
- GGML_ASSERT(false);
14295
- } break;
14296
  }
 
 
 
 
14297
  }
14298
 
14299
  // ggml_compute_forward_cross_entropy_loss
@@ -14815,25 +15035,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14815
  ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
14816
  }
14817
  break;
14818
- case GGML_OP_MAP_CUSTOM1:
14819
  {
14820
  ggml_custom1_op_f32_t fun;
14821
  memcpy(&fun, tensor->op_params, sizeof(fun));
14822
- ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
14823
  }
14824
  break;
14825
- case GGML_OP_MAP_CUSTOM2:
14826
  {
14827
  ggml_custom2_op_f32_t fun;
14828
  memcpy(&fun, tensor->op_params, sizeof(fun));
14829
- ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
14830
  }
14831
  break;
14832
- case GGML_OP_MAP_CUSTOM3:
14833
  {
14834
  ggml_custom3_op_f32_t fun;
14835
  memcpy(&fun, tensor->op_params, sizeof(fun));
14836
- ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
14837
  }
14838
  break;
14839
  case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -15641,6 +15876,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15641
  } break;
15642
  case GGML_OP_MAP_UNARY:
15643
  case GGML_OP_MAP_BINARY:
 
 
 
15644
  case GGML_OP_MAP_CUSTOM1:
15645
  case GGML_OP_MAP_CUSTOM2:
15646
  case GGML_OP_MAP_CUSTOM3:
@@ -15825,6 +16063,35 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
15825
  return result;
15826
  }
15827
15828
  //
15829
  // thread data
15830
  //
@@ -16401,11 +16668,38 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16401
  case GGML_OP_WIN_UNPART:
16402
  case GGML_OP_MAP_UNARY:
16403
  case GGML_OP_MAP_BINARY:
16404
  case GGML_OP_MAP_CUSTOM1:
16405
  case GGML_OP_MAP_CUSTOM2:
16406
  case GGML_OP_MAP_CUSTOM3:
16407
  {
16408
- n_tasks = 1;
16409
  } break;
16410
  case GGML_OP_CROSS_ENTROPY_LOSS:
16411
  {
@@ -16544,10 +16838,9 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16544
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16545
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16546
 
16547
- struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
16548
- GGML_ASSERT(buf);
16549
 
16550
- cplan.work_data = buf->data;
16551
 
16552
  ggml_graph_compute(cgraph, &cplan);
16553
  }
 
195
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
196
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
197
  #else
198
+ inline static void * ggml_aligned_malloc(size_t size) {
199
+ void * aligned_memory = NULL;
200
  #ifdef GGML_USE_METAL
201
  int result = posix_memalign(&aligned_memory, getpagesize(), size);
202
  #else
 
3812
  "CROSS_ENTROPY_LOSS_BACK",
3813
  };
3814
 
3815
+ static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
3816
 
3817
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3818
  "none",
 
3884
  "cross_entropy_loss_back(x,y)",
3885
  };
3886
 
3887
+ static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
3888
 
3889
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
3890
 
 
4072
  ////////////////////////////////////////////////////////////////////////////////
4073
 
4074
  void ggml_print_object(const struct ggml_object * obj) {
4075
+ GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
4076
+ obj->type, obj->offs, obj->size, (const void *) obj->next);
4077
  }
4078
 
4079
  void ggml_print_objects(const struct ggml_context * ctx) {
 
4111
  //
4112
  // is enough, but just in case, adding the second part
4113
 
4114
+ return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
4115
  }
4116
 
4117
  size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
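
Note on the hunks above and below: ggml_nbytes, the context mem_size and the new object allocator now round sizes up with GGML_PAD to GGML_MEM_ALIGN. Assuming GGML_PAD rounds x up to the next multiple of n (the real macro may be implemented with a bit mask, which is equivalent for power-of-two alignments), the arithmetic looks like this — a standalone sketch, not code from the repo:

    #include <stdio.h>

    // Assumed semantics of GGML_PAD: round x up to a multiple of n.
    #define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))

    int main(void) {
        const size_t mem_align = 16; // GGML_MEM_ALIGN is typically 16
        printf("%zu\n", GGML_PAD((size_t) 100, mem_align)); // prints 112
        printf("%zu\n", GGML_PAD((size_t) 112, mem_align)); // prints 112 (already aligned)
        return 0;
    }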
 
4213
  }
4214
 
4215
  size_t ggml_tensor_overhead(void) {
4216
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
4217
  }
4218
 
4219
  bool ggml_is_transposed(const struct ggml_tensor * tensor) {
 
4254
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4255
  }
4256
 
4257
+ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
4258
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4259
 
4260
  return
 
4384
  return NULL;
4385
  }
4386
 
4387
+ const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
4388
 
4389
  *ctx = (struct ggml_context) {
4390
  /*.mem_size =*/ mem_size,
 
4473
  struct ggml_object * obj = ctx->objects_begin;
4474
 
4475
  while (obj != NULL) {
4476
+ if (obj->type == GGML_OBJECT_TENSOR) {
4477
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4478
 
4479
+ const size_t size = ggml_nbytes(tensor);
4480
 
4481
+ if (max_size < size) {
4482
+ max_size = size;
4483
+ }
4484
  }
4485
 
4486
  obj = obj->next;
 
4512
 
4513
  ////////////////////////////////////////////////////////////////////////////////
4514
 
4515
+ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
4516
  // always insert objects at the end of the context's memory pool
4517
  struct ggml_object * obj_cur = ctx->objects_end;
4518
 
 
4520
  const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
4521
  const size_t cur_end = cur_offs + cur_size;
4522
 
4523
+ // align to GGML_MEM_ALIGN
4524
+ size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
4525
 
4526
  char * const mem_buffer = ctx->mem_buffer;
4527
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
4528
 
4529
+ if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4530
+ GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4531
+ __func__, cur_end + size_needed, ctx->mem_size);
4532
+ assert(false);
4533
+ return NULL;
4534
+ }
4535
 
4536
+ *obj_new = (struct ggml_object) {
4537
+ .offs = cur_end + GGML_OBJECT_SIZE,
4538
+ .size = size_needed,
4539
+ .next = NULL,
4540
+ .type = type,
4541
+ };
4542
 
4543
+ ggml_assert_aligned(mem_buffer + obj_new->offs);
4544
+
4545
+ if (obj_cur != NULL) {
4546
+ obj_cur->next = obj_new;
 
4547
  } else {
4548
+ // this is the first object in this context
4549
+ ctx->objects_begin = obj_new;
4550
+ }
4551
+
4552
+ ctx->objects_end = obj_new;
4553
+
4554
+ //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
4555
+
4556
+ return obj_new;
4557
+ }
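
Note on ggml_new_object above: it is now the single entry point for carving allocations out of the context pool — an object header (GGML_OBJECT_SIZE bytes) is written at the current end of the pool, the payload follows at offs = cur_end + GGML_OBJECT_SIZE, and the payload size is padded to GGML_MEM_ALIGN. A small sketch of the resulting offsets; the header and alignment constants below are stand-ins, not the real ggml values:

    #include <stdio.h>

    #define OBJ_SIZE  32   // stand-in for GGML_OBJECT_SIZE (illustration only)
    #define MEM_ALIGN 16   // stand-in for GGML_MEM_ALIGN (illustration only)
    #define PAD(x, n) (((x) + (n) - 1) / (n) * (n))

    int main(void) {
        size_t cur_end = 0;                       // end of the last object in the pool
        const size_t requests[] = { 100, 48 };    // two made-up payload sizes
        for (int i = 0; i < 2; ++i) {
            const size_t size_needed = PAD(requests[i], MEM_ALIGN);
            const size_t offs = cur_end + OBJ_SIZE;   // payload offset, like .offs above
            printf("object %d: payload at %zu, padded size %zu\n", i, offs, size_needed);
            cur_end = offs + size_needed;             // next header goes after this payload
        }
        return 0;
    }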
4558
+
4559
+ static struct ggml_tensor * ggml_new_tensor_impl(
4560
+ struct ggml_context * ctx,
4561
+ enum ggml_type type,
4562
+ int n_dims,
4563
+ const int64_t * ne,
4564
+ void * data) {
4565
+
4566
+ assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
4567
+
4568
+ size_t data_size = 0;
4569
+
4570
+ if (data == NULL && !ctx->no_alloc) {
4571
+ data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
4572
+ for (int i = 1; i < n_dims; i++) {
4573
+ data_size *= ne[i];
4574
  }
4575
+ }
4576
 
4577
+ if (ctx->scratch.data != NULL && data == NULL) {
4578
+ // allocate tensor data in the scratch buffer
4579
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
4580
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4581
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
4582
  assert(false);
4583
  return NULL;
4584
  }
4585
 
4586
  data = (char * const) ctx->scratch.data + ctx->scratch.offs;
4587
 
4588
+ ctx->scratch.offs += data_size;
4589
 
4590
+ data_size = 0;
4591
  }
4592
 
4593
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
4594
 
4595
+ // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
 
 
4596
 
4597
+ struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
4598
 
4599
  *result = (struct ggml_tensor) {
4600
  /*.type =*/ type,
 
4603
  /*.ne =*/ { 1, 1, 1, 1 },
4604
  /*.nb =*/ { 0, 0, 0, 0 },
4605
  /*.op =*/ GGML_OP_NONE,
4606
+ /*.op_params =*/ { 0 },
4607
  /*.is_param =*/ false,
4608
  /*.grad =*/ NULL,
4609
  /*.src =*/ { NULL },
 
4635
  }
4636
 
4637
  static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
4638
+ GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
4639
  assert(params_size <= GGML_MAX_OP_PARAMS);
4640
  memcpy(tensor->op_params, params, params_size);
4641
  }
 
4652
 
4653
  struct ggml_tensor * ggml_new_tensor(
4654
  struct ggml_context * ctx,
4655
+ enum ggml_type type,
4656
+ int n_dims,
4657
+ const int64_t * ne) {
4658
  return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
4659
  }
4660
 
4661
  struct ggml_tensor * ggml_new_tensor_1d(
4662
  struct ggml_context * ctx,
4663
+ enum ggml_type type,
4664
  int64_t ne0) {
4665
  return ggml_new_tensor(ctx, type, 1, &ne0);
4666
  }
4667
 
4668
  struct ggml_tensor * ggml_new_tensor_2d(
4669
  struct ggml_context * ctx,
4670
+ enum ggml_type type,
4671
  int64_t ne0,
4672
  int64_t ne1) {
4673
  const int64_t ne[2] = { ne0, ne1 };
 
4676
 
4677
  struct ggml_tensor * ggml_new_tensor_3d(
4678
  struct ggml_context * ctx,
4679
+ enum ggml_type type,
4680
  int64_t ne0,
4681
  int64_t ne1,
4682
  int64_t ne2) {
 
4986
  return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
4987
  }
4988
4989
  const char * ggml_get_name(const struct ggml_tensor * tensor) {
4990
  return tensor->name;
4991
  }
 
5024
  char * const mem_buffer = ctx->mem_buffer;
5025
 
5026
  while (obj != NULL) {
5027
+ if (obj->type == GGML_OBJECT_TENSOR) {
5028
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
5029
+ if (strcmp(cur->name, name) == 0) {
5030
+ return cur;
5031
+ }
5032
  }
5033
 
5034
  obj = obj->next;
 
6242
 
6243
  // ggml_view_1d
6244
 
6245
+ static struct ggml_tensor * ggml_view_tensor_offset(
6246
+ struct ggml_context * ctx,
6247
+ struct ggml_tensor * a,
6248
+ int n_dims,
6249
+ const int64_t * ne,
6250
+ size_t offset) {
6251
+ // don't calculate an offset from an unallocated tensor
6252
+ void * data = NULL;
6253
+ if (a->data != NULL) {
6254
+ data = (char *) a->data + offset;
6255
+ }
6256
+
6257
+ struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
6258
+
6259
+ ggml_format_name(result, "%s (view)", a->name);
6260
+
6261
+ ggml_set_op_params(result, &offset, sizeof(offset));
6262
+
6263
+ return result;
6264
+ }
6265
+
6266
  struct ggml_tensor * ggml_view_1d(
6267
  struct ggml_context * ctx,
6268
  struct ggml_tensor * a,
 
6275
  is_node = true;
6276
  }
6277
 
6278
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
 
 
 
6279
 
6280
  result->op = GGML_OP_VIEW;
6281
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6302
 
6303
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6304
 
6305
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
 
 
 
6306
 
6307
  result->nb[1] = nb1;
6308
  result->nb[2] = result->nb[1]*ne1;
 
6335
 
6336
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6337
 
6338
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
 
 
 
6339
 
6340
  result->nb[1] = nb1;
6341
  result->nb[2] = nb2;
 
6370
 
6371
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6372
 
6373
+ struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
 
 
 
6374
 
6375
  result->nb[1] = nb1;
6376
  result->nb[2] = nb2;
 
6441
  result->src[0] = a;
6442
 
6443
  int32_t params[] = { axis0, axis1, axis2, axis3 };
6444
+ ggml_set_op_params(result, params, sizeof(params));
6445
 
6446
  return result;
6447
  }
 
6567
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6568
 
6569
  int32_t params[] = { n_past, inplace ? 1 : 0 };
6570
+ ggml_set_op_params(result, params, sizeof(params));
6571
 
6572
  result->op = GGML_OP_DIAG_MASK_INF;
6573
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6607
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6608
 
6609
  int32_t params[] = { n_past, inplace ? 1 : 0 };
6610
+ ggml_set_op_params(result, params, sizeof(params));
6611
 
6612
  result->op = GGML_OP_DIAG_MASK_ZERO;
6613
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6723
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6724
 
6725
  int32_t params[6] = { n_past, n_dims, mode, n_ctx };
6726
+ memcpy(params + 4, &freq_base, sizeof(float));
6727
  memcpy(params + 5, &freq_scale, sizeof(float));
6728
+ ggml_set_op_params(result, params, sizeof(params));
6729
 
6730
  result->op = GGML_OP_ROPE;
6731
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6754
  return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
6755
  }
6756
 
6757
+ struct ggml_tensor * ggml_rope_custom(
6758
+ struct ggml_context * ctx,
6759
+ struct ggml_tensor * a,
6760
+ int n_past,
6761
+ int n_dims,
6762
+ int mode,
6763
+ int n_ctx,
6764
+ float freq_base,
6765
+ float freq_scale) {
6766
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
6767
+ }
6768
+
6769
  struct ggml_tensor * ggml_rope_custom_inplace(
6770
  struct ggml_context * ctx,
6771
  struct ggml_tensor * a,
 
6799
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
6800
 
6801
  int32_t params[] = { n_past, n_dims, mode, n_ctx };
6802
+ ggml_set_op_params(result, params, sizeof(params));
6803
 
6804
  result->op = GGML_OP_ROPE_BACK;
6805
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6830
 
6831
  int32_t op_params[3] = { n_past, n_head };
6832
  memcpy(op_params + 2, &bias_max, sizeof(float));
6833
+ ggml_set_op_params(result, op_params, sizeof(op_params));
6834
 
6835
  result->op = GGML_OP_ALIBI;
6836
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6857
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6858
 
6859
  float params[] = { min, max };
6860
+ ggml_set_op_params(result, params, sizeof(params));
6861
 
6862
  result->op = GGML_OP_CLAMP;
6863
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6892
  ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
6893
  a->ne[2], 1, 1,
6894
  };
6895
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6896
 
6897
  int32_t params[] = { s0, p0, d0 };
6898
+ ggml_set_op_params(result, params, sizeof(params));
6899
 
6900
  result->op = GGML_OP_CONV_1D;
6901
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6907
 
6908
  // ggml_conv_2d
6909
 
6910
+ struct ggml_tensor * ggml_conv_2d(
6911
+ struct ggml_context * ctx,
6912
+ struct ggml_tensor * a,
6913
+ struct ggml_tensor * b,
6914
  int s0,
6915
  int s1,
6916
  int p0,
 
6931
  ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
6932
  a->ne[3], b->ne[3],
6933
  };
6934
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
6935
 
6936
  int32_t params[] = { s0, s1, p0, p1, d0, d1 };
6937
+ ggml_set_op_params(result, params, sizeof(params));
6938
 
6939
  result->op = GGML_OP_CONV_2D;
6940
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6947
 
6948
  // ggml_conv_1d_ph
6949
 
6950
+ struct ggml_tensor * ggml_conv_1d_ph(
6951
  struct ggml_context * ctx,
6952
  struct ggml_tensor * a,
6953
  struct ggml_tensor * b,
 
6965
 
6966
  // ggml_pool_1d
6967
 
6968
+ struct ggml_tensor * ggml_pool_1d(
6969
  struct ggml_context * ctx,
6970
  struct ggml_tensor * a,
6971
  enum ggml_op_pool op,
 
6984
  ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
6985
  a->ne[1],
6986
  };
6987
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6988
 
6989
  int32_t params[] = { op, k0, s0, p0 };
6990
+ ggml_set_op_params(result, params, sizeof(params));
6991
 
6992
  result->op = GGML_OP_POOL_1D;
6993
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6998
 
6999
  // ggml_pool_2d
7000
 
7001
+ struct ggml_tensor * ggml_pool_2d(
7002
  struct ggml_context * ctx,
7003
  struct ggml_tensor * a,
7004
  enum ggml_op_pool op,
 
7021
  ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
7022
  a->ne[2],
7023
  };
7024
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7025
 
7026
  int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
7027
+ ggml_set_op_params(result, params, sizeof(params));
7028
 
7029
  result->op = GGML_OP_POOL_2D;
7030
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
7192
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7193
 
7194
  int32_t params[] = { npx, npy, w };
7195
+ ggml_set_op_params(result, params, sizeof(params));
7196
 
7197
  result->op = GGML_OP_WIN_PART;
7198
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
7222
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7223
 
7224
  int32_t params[] = { w };
7225
+ ggml_set_op_params(result, params, sizeof(params));
7226
 
7227
  result->op = GGML_OP_WIN_UNPART;
7228
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
7246
 
7247
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7248
 
7249
+ ggml_set_op_params_i32(result, 0, (int32_t) op);
7250
 
7251
  result->op = GGML_OP_UNARY;
7252
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
7351
  return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
7352
  }
7353
 
7354
+ // ggml_map_custom1_f32
7355
 
7356
  static struct ggml_tensor * ggml_map_custom1_impl_f32(
7357
  struct ggml_context * ctx,
 
7368
 
7369
  ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7370
 
7371
+ result->op = GGML_OP_MAP_CUSTOM1_F32;
7372
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7373
  result->src[0] = a;
7374
 
 
7389
  return ggml_map_custom1_impl_f32(ctx, a, fun, true);
7390
  }
7391
 
7392
+ // ggml_map_custom2_f32
7393
 
7394
  static struct ggml_tensor * ggml_map_custom2_impl_f32(
7395
  struct ggml_context * ctx,
 
7407
 
7408
  ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7409
 
7410
+ result->op = GGML_OP_MAP_CUSTOM2_F32;
7411
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7412
  result->src[0] = a;
7413
  result->src[1] = b;
 
7431
  return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
7432
  }
7433
 
7434
+ // ggml_map_custom3_f32
7435
 
7436
  static struct ggml_tensor * ggml_map_custom3_impl_f32(
7437
  struct ggml_context * ctx,
 
7450
 
7451
  ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
7452
 
7453
+ result->op = GGML_OP_MAP_CUSTOM3_F32;
7454
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7455
  result->src[0] = a;
7456
  result->src[1] = b;
 
7477
  return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
7478
  }
7479
 
7480
+ // ggml_map_custom1
7481
+ struct ggml_map_custom1_op_params {
7482
+ ggml_custom1_op_t fun;
7483
+ int n_tasks;
7484
+ void * userdata;
7485
+ };
7486
+
7487
+ static struct ggml_tensor * ggml_map_custom1_impl(
7488
+ struct ggml_context * ctx,
7489
+ struct ggml_tensor * a,
7490
+ const ggml_custom1_op_t fun,
7491
+ int n_tasks,
7492
+ void * userdata,
7493
+ bool inplace) {
7494
+ GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
7495
+
7496
+ bool is_node = false;
7497
+
7498
+ if (!inplace && a->grad) {
7499
+ is_node = true;
7500
+ }
7501
+
7502
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7503
+
7504
+ struct ggml_map_custom1_op_params params = {
7505
+ /*.fun =*/ fun,
7506
+ /*.n_tasks =*/ n_tasks,
7507
+ /*.userdata =*/ userdata
7508
+ };
7509
+ ggml_set_op_params(result, (const void *) &params, sizeof(params));
7510
+
7511
+ result->op = GGML_OP_MAP_CUSTOM1;
7512
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7513
+ result->src[0] = a;
7514
+
7515
+ return result;
7516
+ }
7517
+
7518
+ struct ggml_tensor * ggml_map_custom1(
7519
+ struct ggml_context * ctx,
7520
+ struct ggml_tensor * a,
7521
+ const ggml_custom1_op_t fun,
7522
+ int n_tasks,
7523
+ void * userdata) {
7524
+ return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
7525
+ }
7526
+
7527
+ struct ggml_tensor * ggml_map_custom1_inplace(
7528
+ struct ggml_context * ctx,
7529
+ struct ggml_tensor * a,
7530
+ const ggml_custom1_op_t fun,
7531
+ int n_tasks,
7532
+ void * userdata) {
7533
+ return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
7534
+ }
7535
+
7536
+ // ggml_map_custom2
7537
+
7538
+ struct ggml_map_custom2_op_params {
7539
+ ggml_custom2_op_t fun;
7540
+ int n_tasks;
7541
+ void * userdata;
7542
+ };
7543
+
7544
+ static struct ggml_tensor * ggml_map_custom2_impl(
7545
+ struct ggml_context * ctx,
7546
+ struct ggml_tensor * a,
7547
+ struct ggml_tensor * b,
7548
+ const ggml_custom2_op_t fun,
7549
+ int n_tasks,
7550
+ void * userdata,
7551
+ bool inplace) {
7552
+ GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
7553
+
7554
+ bool is_node = false;
7555
+
7556
+ if (!inplace && (a->grad || b->grad)) {
7557
+ is_node = true;
7558
+ }
7559
+
7560
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7561
+
7562
+ struct ggml_map_custom2_op_params params = {
7563
+ /*.fun =*/ fun,
7564
+ /*.n_tasks =*/ n_tasks,
7565
+ /*.userdata =*/ userdata
7566
+ };
7567
+ ggml_set_op_params(result, (const void *) &params, sizeof(params));
7568
+
7569
+ result->op = GGML_OP_MAP_CUSTOM2;
7570
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7571
+ result->src[0] = a;
7572
+ result->src[1] = b;
7573
+
7574
+ return result;
7575
+ }
7576
+
7577
+ struct ggml_tensor * ggml_map_custom2(
7578
+ struct ggml_context * ctx,
7579
+ struct ggml_tensor * a,
7580
+ struct ggml_tensor * b,
7581
+ const ggml_custom2_op_t fun,
7582
+ int n_tasks,
7583
+ void * userdata) {
7584
+ return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
7585
+ }
7586
+
7587
+ struct ggml_tensor * ggml_map_custom2_inplace(
7588
+ struct ggml_context * ctx,
7589
+ struct ggml_tensor * a,
7590
+ struct ggml_tensor * b,
7591
+ const ggml_custom2_op_t fun,
7592
+ int n_tasks,
7593
+ void * userdata) {
7594
+ return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
7595
+ }
7596
+
7597
+ // ggml_map_custom3
7598
+
7599
+ struct ggml_map_custom3_op_params {
7600
+ ggml_custom3_op_t fun;
7601
+ int n_tasks;
7602
+ void * userdata;
7603
+ };
7604
+
7605
+ static struct ggml_tensor * ggml_map_custom3_impl(
7606
+ struct ggml_context * ctx,
7607
+ struct ggml_tensor * a,
7608
+ struct ggml_tensor * b,
7609
+ struct ggml_tensor * c,
7610
+ const ggml_custom3_op_t fun,
7611
+ int n_tasks,
7612
+ void * userdata,
7613
+ bool inplace) {
7614
+ GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
7615
+
7616
+ bool is_node = false;
7617
+
7618
+ if (!inplace && (a->grad || b->grad || c->grad)) {
7619
+ is_node = true;
7620
+ }
7621
+
7622
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7623
+
7624
+ struct ggml_map_custom3_op_params params = {
7625
+ /*.fun =*/ fun,
7626
+ /*.n_tasks =*/ n_tasks,
7627
+ /*.userdata =*/ userdata
7628
+ };
7629
+ ggml_set_op_params(result, (const void *) &params, sizeof(params));
7630
+
7631
+ result->op = GGML_OP_MAP_CUSTOM3;
7632
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7633
+ result->src[0] = a;
7634
+ result->src[1] = b;
7635
+ result->src[2] = c;
7636
+
7637
+ return result;
7638
+ }
7639
+
7640
+ struct ggml_tensor * ggml_map_custom3(
7641
+ struct ggml_context * ctx,
7642
+ struct ggml_tensor * a,
7643
+ struct ggml_tensor * b,
7644
+ struct ggml_tensor * c,
7645
+ const ggml_custom3_op_t fun,
7646
+ int n_tasks,
7647
+ void * userdata) {
7648
+ return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
7649
+ }
7650
+
7651
+ struct ggml_tensor * ggml_map_custom3_inplace(
7652
+ struct ggml_context * ctx,
7653
+ struct ggml_tensor * a,
7654
+ struct ggml_tensor * b,
7655
+ struct ggml_tensor * c,
7656
+ const ggml_custom3_op_t fun,
7657
+ int n_tasks,
7658
+ void * userdata) {
7659
+ return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
7660
+ }
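
Note on the ggml_map_custom{1,2,3} additions above: they replace the old single-threaded *_f32 map operators with an API that stores the callback, a task count and a userdata pointer in op_params and forwards ith/nth to the callback during compute. A minimal usage sketch, assuming a contiguous F32 input; the callback body and names are illustrative, not taken from the repo:

    #include "ggml.h"

    // Matches ggml_custom1_op_t: each of the nth tasks fills its share of rows of dst.
    static void scale_rows(struct ggml_tensor * dst, const struct ggml_tensor * a,
                           int ith, int nth, void * userdata) {
        const float factor = *(const float *) userdata;
        const int64_t nr  = ggml_nrows(a);
        const int64_t dr  = (nr + nth - 1)/nth;          // rows per task
        const int64_t ir0 = dr*ith;
        const int64_t ir1 = ir0 + dr < nr ? ir0 + dr : nr;
        for (int64_t ir = ir0; ir < ir1; ++ir) {
            const float * src = (const float *) ((const char *) a->data   + ir*a->nb[1]);
            float       * out = (float       *) ((char       *) dst->data + ir*dst->nb[1]);
            for (int64_t i = 0; i < a->ne[0]; ++i) {
                out[i] = factor*src[i];
            }
        }
    }

    // In graph-building code (ctx and x assumed to exist):
    //   static float factor = 2.0f;
    //   struct ggml_tensor * y = ggml_map_custom1(ctx, x, scale_rows, GGML_N_TASKS_MAX, &factor);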
7661
+
7662
+
7663
+
7664
  // ggml_cross_entropy_loss
7665
 
7666
  struct ggml_tensor * ggml_cross_entropy_loss(
 
9469
  for (int64_t i3 = 0; i3 < ne03; i3++) {
9470
  for (int64_t i2 = 0; i2 < ne02; i2++) {
9471
  for (int64_t i1 = 0; i1 < ne01; i1++) {
9472
+ float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
9473
+ float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
9474
  float row_sum = 0;
9475
  ggml_vec_sum_f32(ne00, &row_sum, src_row);
9476
  dst_row[0] = row_sum;
 
10727
  return;
10728
  }
10729
 
10730
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10731
+ const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
10732
 
10733
+ const int64_t nr0 = ne01; // src0 rows
10734
+ const int64_t nr1 = ne11*ne12*ne13; // src1 rows
10735
 
10736
+ //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
10737
 
10738
+ // distribute the thread work across the inner or outer loop based on which one is larger
 
10739
 
10740
+ const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
10741
+ const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
10742
+
10743
+ const int64_t ith0 = ith % nth0;
10744
+ const int64_t ith1 = ith / nth0;
10745
+
10746
+ const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
10747
+ const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
10748
+
10749
+ const int64_t ir010 = dr0*ith0;
10750
+ const int64_t ir011 = MIN(ir010 + dr0, nr0);
10751
+
10752
+ const int64_t ir110 = dr1*ith1;
10753
+ const int64_t ir111 = MIN(ir110 + dr1, nr1);
10754
+
10755
+ //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
10756
+
10757
+ // threads with no work simply yield (not sure if it helps)
10758
+ if (ir010 >= ir011 || ir110 >= ir111) {
10759
+ sched_yield();
10760
+ return;
 
10761
  }
10762
 
10763
+ assert(ne12 % ne02 == 0);
10764
+ assert(ne13 % ne03 == 0);
 
10765
 
10766
+ // broadcast factors
10767
+ const int64_t r2 = ne12/ne02;
10768
+ const int64_t r3 = ne13/ne03;
10769
 
10770
+ // block-tiling attempt
10771
+ const int64_t blck_0 = 16;
10772
+ const int64_t blck_1 = 16;
10773
 
10774
+ // attempt to reduce false-sharing (does not seem to make a difference)
10775
+ float tmp[16];
10776
+
10777
+ for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
10778
+ for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
10779
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
10780
+ const int64_t i13 = (ir1/(ne12*ne11));
10781
+ const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
10782
+ const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
10783
+
10784
+ // broadcast src0 into src1
10785
+ const int64_t i03 = i13/r3;
10786
+ const int64_t i02 = i12/r2;
10787
+
10788
+ const int64_t i1 = i11;
10789
+ const int64_t i2 = i12;
10790
+ const int64_t i3 = i13;
10791
+
10792
+ const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
10793
 
10794
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
10795
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
10796
+ // the original src1 data pointer, so we should index using the indices directly
10797
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
10798
+ const char * src1_col = (const char *) wdata +
10799
+ (src1_cont || src1->type != vec_dot_type
10800
+ ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
10801
+ : (i11*nb11 + i12*nb12 + i13*nb13));
10802
+
10803
+ float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
10804
+
10805
+ //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
10806
+ // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
10807
+ //}
10808
+
10809
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
10810
+ vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
10811
+ }
10812
+ memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
10813
+ }
10814
+ }
10815
+ }
10816
+ }
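
Note on the rewritten ggml_compute_forward_mul_mat above: the threads now split the work along whichever side has more rows (nr0 = src0 rows vs nr1 = src1 rows times the broadcast dims) and then iterate 16x16 blocks. A standalone sketch of the range arithmetic with assumed sizes, mirroring the nth0/nth1/dr0/dr1 formulas above:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        const long long nr0 = 4096, nr1 = 8; // e.g. many weight rows, few src1 rows (assumed)
        const long long nth = 4;             // threads

        for (long long ith = 0; ith < nth; ++ith) {
            const long long nth0 = nr0 > nr1 ? nth : 1;   // parallelize by src0 rows
            const long long nth1 = nr0 > nr1 ? 1 : nth;   // ...or by src1 rows
            const long long ith0 = ith % nth0;
            const long long ith1 = ith / nth0;
            const long long dr0  = (nr0 + nth0 - 1)/nth0;
            const long long dr1  = (nr1 + nth1 - 1)/nth1;
            printf("thread %lld: src0 rows [%lld, %lld), src1 rows [%lld, %lld)\n",
                   ith, dr0*ith0, MIN(dr0*ith0 + dr0, nr0), dr1*ith1, MIN(dr1*ith1 + dr1, nr1));
        }
        return 0;
    }

With these sizes each thread gets 1024 src0 rows and the full 8 src1 rows, so the large dimension is what gets divided.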
10817
+
10818
+ // ggml_compute_forward_out_prod
10819
 
10820
  static void ggml_compute_forward_out_prod_f32(
10821
  const struct ggml_compute_params * params,
 
13099
  const struct ggml_tensor * src0,
13100
  struct ggml_tensor * dst) {
13101
 
13102
+ const int32_t * opts = (const int32_t *)dst->op_params;
13103
  enum ggml_op_pool op = opts[0];
13104
  const int k0 = opts[1];
13105
  const int s0 = opts[2];
 
14432
  fun(dst, a);
14433
  }
14434
14435
  // ggml_compute_forward_map_custom2
14436
 
14437
  static void ggml_compute_forward_map_custom2_f32(
 
14450
  }
14451
 
14452
 
 
 
14453
  // ggml_compute_forward_map_custom3
14454
 
14455
  static void ggml_compute_forward_map_custom3_f32(
 
14468
  fun(dst, a, b, c);
14469
  }
14470
 
14471
+ // ggml_compute_forward_map_custom1
14472
+
14473
+ static void ggml_compute_forward_map_custom1(
14474
+ const struct ggml_compute_params * params,
14475
+ const struct ggml_tensor * a,
14476
+ struct ggml_tensor * dst) {
14477
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14478
+ return;
14479
+ }
14480
+
14481
+ struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
14482
+
14483
+ p->fun(dst, a, params->ith, params->nth, p->userdata);
14484
+ }
14485
+
14486
+ // ggml_compute_forward_map_custom2
14487
+
14488
+ static void ggml_compute_forward_map_custom2(
14489
+ const struct ggml_compute_params * params,
14490
+ const struct ggml_tensor * a,
14491
+ const struct ggml_tensor * b,
14492
+ struct ggml_tensor * dst) {
14493
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14494
+ return;
14495
+ }
14496
+
14497
+ struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
14498
+
14499
+ p->fun(dst, a, b, params->ith, params->nth, p->userdata);
14500
+ }
14501
+
14502
+ // ggml_compute_forward_map_custom3
14503
 
14504
  static void ggml_compute_forward_map_custom3(
14505
  const struct ggml_compute_params * params,
14506
  const struct ggml_tensor * a,
14507
  const struct ggml_tensor * b,
14508
  const struct ggml_tensor * c,
14509
+ struct ggml_tensor * dst) {
14510
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14511
+ return;
 
 
14512
  }
14513
+
14514
+ struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
14515
+
14516
+ p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
14517
  }
14518
 
14519
  // ggml_compute_forward_cross_entropy_loss
 
15035
  ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
15036
  }
15037
  break;
15038
+ case GGML_OP_MAP_CUSTOM1_F32:
15039
  {
15040
  ggml_custom1_op_f32_t fun;
15041
  memcpy(&fun, tensor->op_params, sizeof(fun));
15042
+ ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
15043
  }
15044
  break;
15045
+ case GGML_OP_MAP_CUSTOM2_F32:
15046
  {
15047
  ggml_custom2_op_f32_t fun;
15048
  memcpy(&fun, tensor->op_params, sizeof(fun));
15049
+ ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
15050
  }
15051
  break;
15052
+ case GGML_OP_MAP_CUSTOM3_F32:
15053
  {
15054
  ggml_custom3_op_f32_t fun;
15055
  memcpy(&fun, tensor->op_params, sizeof(fun));
15056
+ ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
15057
+ }
15058
+ break;
15059
+ case GGML_OP_MAP_CUSTOM1:
15060
+ {
15061
+ ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
15062
+ }
15063
+ break;
15064
+ case GGML_OP_MAP_CUSTOM2:
15065
+ {
15066
+ ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
15067
+ }
15068
+ break;
15069
+ case GGML_OP_MAP_CUSTOM3:
15070
+ {
15071
+ ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15072
  }
15073
  break;
15074
  case GGML_OP_CROSS_ENTROPY_LOSS:
 
15876
  } break;
15877
  case GGML_OP_MAP_UNARY:
15878
  case GGML_OP_MAP_BINARY:
15879
+ case GGML_OP_MAP_CUSTOM1_F32:
15880
+ case GGML_OP_MAP_CUSTOM2_F32:
15881
+ case GGML_OP_MAP_CUSTOM3_F32:
15882
  case GGML_OP_MAP_CUSTOM1:
15883
  case GGML_OP_MAP_CUSTOM2:
15884
  case GGML_OP_MAP_CUSTOM3:
 
16063
  return result;
16064
  }
16065
 
16066
+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
16067
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
16068
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
16069
+
16070
+ *cgraph = (struct ggml_cgraph) {
16071
+ /*.n_nodes =*/ 0,
16072
+ /*.n_leafs =*/ 0,
16073
+ /*.nodes =*/ { NULL },
16074
+ /*.grads =*/ { NULL },
16075
+ /*.leafs =*/ { NULL },
16076
+ /*.hash_table =*/ { NULL },
16077
+ /*.perf_runs =*/ 0,
16078
+ /*.perf_cycles =*/ 0,
16079
+ /*.perf_time_us =*/ 0,
16080
+ };
16081
+
16082
+ return cgraph;
16083
+ }
16084
+
16085
+ struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
16086
+ struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
16087
+ ggml_build_forward_impl(cgraph, tensor, false);
16088
+ return cgraph;
16089
+ }
16090
+
16091
+ size_t ggml_graph_overhead(void) {
16092
+ return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
16093
+ }
16094
+
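A minimal usage sketch for the new in-context graph helpers (assumed calling pattern, not part of this diff; the sizes are placeholders):
// reserve room for the tensors, the graph object and the compute work buffer in one context
struct ggml_init_params ip = {
    /*.mem_size   =*/ 16*1024*1024 + ggml_graph_overhead(),
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ false,
};
struct ggml_context * ctx = ggml_init(ip);
// ... create tensors and ops here, ending in a tensor `result` ...
struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx, result); // graph is allocated inside ctx
ggml_graph_compute_with_ctx(ctx, gf, 4);                       // work buffer is also taken from ctx
ggml_free(ctx);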
16095
  //
16096
  // thread data
16097
  //
 
16668
  case GGML_OP_WIN_UNPART:
16669
  case GGML_OP_MAP_UNARY:
16670
  case GGML_OP_MAP_BINARY:
16671
+ case GGML_OP_MAP_CUSTOM1_F32:
16672
+ case GGML_OP_MAP_CUSTOM2_F32:
16673
+ case GGML_OP_MAP_CUSTOM3_F32:
16674
+ {
16675
+ n_tasks = 1;
16676
+ } break;
16677
  case GGML_OP_MAP_CUSTOM1:
16678
+ {
16679
+ struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
16680
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
16681
+ n_tasks = n_threads;
16682
+ } else {
16683
+ n_tasks = MIN(p->n_tasks, n_threads);
16684
+ }
16685
+ } break;
16686
  case GGML_OP_MAP_CUSTOM2:
16687
+ {
16688
+ struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
16689
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
16690
+ n_tasks = n_threads;
16691
+ } else {
16692
+ n_tasks = MIN(p->n_tasks, n_threads);
16693
+ }
16694
+ } break;
16695
  case GGML_OP_MAP_CUSTOM3:
16696
  {
16697
+ struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
16698
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
16699
+ n_tasks = n_threads;
16700
+ } else {
16701
+ n_tasks = MIN(p->n_tasks, n_threads);
16702
+ }
16703
  } break;
16704
  case GGML_OP_CROSS_ENTROPY_LOSS:
16705
  {
 
16838
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16839
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16840
 
16841
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
 
16842
 
16843
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
16844
 
16845
  ggml_graph_compute(cgraph, &cplan);
16846
  }
ggml.h CHANGED
@@ -183,6 +183,15 @@
183
  # define GGML_API
184
  #endif
185
 
 
 
186
  #include <stdint.h>
187
  #include <stddef.h>
188
  #include <stdbool.h>
@@ -208,6 +217,8 @@
208
 
209
  #define GGML_UNUSED(x) (void)(x)
210
 
 
 
211
  #define GGML_ASSERT(x) \
212
  do { \
213
  if (!(x)) { \
@@ -372,6 +383,10 @@ extern "C" {
372
  GGML_OP_MAP_UNARY,
373
  GGML_OP_MAP_BINARY,
374
 
 
 
 
 
375
  GGML_OP_MAP_CUSTOM1,
376
  GGML_OP_MAP_CUSTOM2,
377
  GGML_OP_MAP_CUSTOM3,
@@ -395,6 +410,12 @@ extern "C" {
395
  GGML_UNARY_OP_SILU,
396
  };
397
 
 
 
398
  // ggml object
399
  struct ggml_object {
400
  size_t offs;
@@ -402,7 +423,9 @@ extern "C" {
402
 
403
  struct ggml_object * next;
404
 
405
- char padding[8];
 
 
406
  };
407
 
408
  static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
@@ -423,7 +446,7 @@ extern "C" {
423
  enum ggml_op op;
424
 
425
  // op params - allocated as int32_t for alignment
426
- int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
427
 
428
  bool is_param;
429
 
@@ -484,6 +507,8 @@ extern "C" {
484
  int64_t perf_time_us;
485
  };
486
 
 
 
487
  // scratch buffer
488
  struct ggml_scratch {
489
  size_t offs;
@@ -558,6 +583,8 @@ extern "C" {
558
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
559
  GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
560
 
 
 
561
  // use this to compute the memory overhead of a tensor
562
  GGML_API size_t ggml_tensor_overhead(void);
563
 
@@ -1158,7 +1185,18 @@ extern "C" {
1158
  int mode,
1159
  int n_ctx);
1160
 
1161
- // custom RoPE, in-place, returns view(a)
 
 
1162
  GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1163
  struct ggml_context * ctx,
1164
  struct ggml_tensor * a,
@@ -1217,7 +1255,7 @@ extern "C" {
1217
 
1218
  // conv_1d with padding = half
1219
  // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
1220
- GGML_API struct ggml_tensor* ggml_conv_1d_ph(
1221
  struct ggml_context * ctx,
1222
  struct ggml_tensor * a,
1223
  struct ggml_tensor * b,
@@ -1230,7 +1268,7 @@ extern "C" {
1230
  GGML_OP_POOL_COUNT,
1231
  };
1232
 
1233
- GGML_API struct ggml_tensor* ggml_pool_1d(
1234
  struct ggml_context * ctx,
1235
  struct ggml_tensor * a,
1236
  enum ggml_op_pool op,
@@ -1238,7 +1276,7 @@ extern "C" {
1238
  int s0, // stride
1239
  int p0); // padding
1240
 
1241
- GGML_API struct ggml_tensor* ggml_pool_2d(
1242
  struct ggml_context * ctx,
1243
  struct ggml_tensor * a,
1244
  enum ggml_op_pool op,
@@ -1292,15 +1330,6 @@ extern "C" {
1292
  int h0,
1293
  int w);
1294
 
1295
- // custom operators
1296
-
1297
- typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
1298
- typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
1299
-
1300
- typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
1301
- typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1302
- typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1303
-
1304
  GGML_API struct ggml_tensor * ggml_unary(
1305
  struct ggml_context * ctx,
1306
  struct ggml_tensor * a,
@@ -1311,63 +1340,138 @@ extern "C" {
1311
  struct ggml_tensor * a,
1312
  enum ggml_unary_op op);
1313
 
1314
- GGML_API struct ggml_tensor * ggml_map_unary_f32(
 
 
1315
  struct ggml_context * ctx,
1316
  struct ggml_tensor * a,
1317
- ggml_unary_op_f32_t fun);
 
1318
 
1319
- GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
1320
  struct ggml_context * ctx,
1321
  struct ggml_tensor * a,
1322
- ggml_unary_op_f32_t fun);
 
1323
 
1324
- GGML_API struct ggml_tensor * ggml_map_binary_f32(
1325
  struct ggml_context * ctx,
1326
  struct ggml_tensor * a,
1327
  struct ggml_tensor * b,
1328
- ggml_binary_op_f32_t fun);
 
1329
 
1330
- GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
1331
  struct ggml_context * ctx,
1332
  struct ggml_tensor * a,
1333
  struct ggml_tensor * b,
1334
- ggml_binary_op_f32_t fun);
 
1335
 
1336
- GGML_API struct ggml_tensor * ggml_map_custom1_f32(
1337
  struct ggml_context * ctx,
1338
  struct ggml_tensor * a,
1339
- ggml_custom1_op_f32_t fun);
 
1340
 
1341
- GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
1342
  struct ggml_context * ctx,
1343
  struct ggml_tensor * a,
1344
- ggml_custom1_op_f32_t fun);
 
1345
 
1346
- GGML_API struct ggml_tensor * ggml_map_custom2_f32(
1347
  struct ggml_context * ctx,
1348
  struct ggml_tensor * a,
1349
  struct ggml_tensor * b,
1350
- ggml_custom2_op_f32_t fun);
 
1351
 
1352
- GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
1353
  struct ggml_context * ctx,
1354
  struct ggml_tensor * a,
1355
  struct ggml_tensor * b,
1356
- ggml_custom2_op_f32_t fun);
 
1357
 
1358
- GGML_API struct ggml_tensor * ggml_map_custom3_f32(
1359
  struct ggml_context * ctx,
1360
  struct ggml_tensor * a,
1361
  struct ggml_tensor * b,
1362
  struct ggml_tensor * c,
1363
- ggml_custom3_op_f32_t fun);
 
1364
 
1365
- GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
1366
  struct ggml_context * ctx,
1367
  struct ggml_tensor * a,
1368
  struct ggml_tensor * b,
1369
  struct ggml_tensor * c,
1370
- ggml_custom3_op_f32_t fun);
 
 
1371
 
1372
  // loss function
1373
 
@@ -1390,11 +1494,17 @@ extern "C" {
1390
  struct ggml_context * ctx,
1391
  struct ggml_tensor * tensor);
1392
 
 
1393
  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1394
 
1395
  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
1396
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
1397
 
 
 
 
 
 
1398
  // ggml_graph_plan() has to be called before ggml_graph_compute()
1399
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
1400
  GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
 
183
  # define GGML_API
184
  #endif
185
 
186
+ // TODO: support for clang
187
+ #ifdef __GNUC__
188
+ # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
189
+ #elif defined(_MSC_VER)
190
+ # define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
191
+ #else
192
+ # define GGML_DEPRECATED(func, hint) func
193
+ #endif
194
+
195
  #include <stdint.h>
196
  #include <stddef.h>
197
  #include <stdbool.h>
 
217
 
218
  #define GGML_UNUSED(x) (void)(x)
219
 
220
+ #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
221
+
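For reference, GGML_PAD rounds x up to the next multiple of n (n must be a power of two):
// GGML_PAD(10, 16) == (10 + 15) & ~15 == 16
// GGML_PAD(32, 16) == (32 + 15) & ~15 == 32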
222
  #define GGML_ASSERT(x) \
223
  do { \
224
  if (!(x)) { \
 
383
  GGML_OP_MAP_UNARY,
384
  GGML_OP_MAP_BINARY,
385
 
386
+ GGML_OP_MAP_CUSTOM1_F32,
387
+ GGML_OP_MAP_CUSTOM2_F32,
388
+ GGML_OP_MAP_CUSTOM3_F32,
389
+
390
  GGML_OP_MAP_CUSTOM1,
391
  GGML_OP_MAP_CUSTOM2,
392
  GGML_OP_MAP_CUSTOM3,
 
410
  GGML_UNARY_OP_SILU,
411
  };
412
 
413
+ enum ggml_object_type {
414
+ GGML_OBJECT_TENSOR,
415
+ GGML_OBJECT_GRAPH,
416
+ GGML_OBJECT_WORK_BUFFER
417
+ };
418
+
419
  // ggml object
420
  struct ggml_object {
421
  size_t offs;
 
423
 
424
  struct ggml_object * next;
425
 
426
+ enum ggml_object_type type;
427
+
428
+ char padding[4];
429
  };
430
 
431
  static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
 
446
  enum ggml_op op;
447
 
448
  // op params - allocated as int32_t for alignment
449
+ int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
450
 
451
  bool is_param;
452
 
 
507
  int64_t perf_time_us;
508
  };
509
 
510
+ static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
511
+
512
  // scratch buffer
513
  struct ggml_scratch {
514
  size_t offs;
 
583
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
584
  GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
585
 
586
+ GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
587
+
588
  // use this to compute the memory overhead of a tensor
589
  GGML_API size_t ggml_tensor_overhead(void);
590
 
 
1185
  int mode,
1186
  int n_ctx);
1187
 
1188
+ // custom RoPE
1189
+ GGML_API struct ggml_tensor * ggml_rope_custom(
1190
+ struct ggml_context * ctx,
1191
+ struct ggml_tensor * a,
1192
+ int n_past,
1193
+ int n_dims,
1194
+ int mode,
1195
+ int n_ctx,
1196
+ float freq_base,
1197
+ float freq_scale);
1198
+
1199
+ // in-place, returns view(a)
1200
  GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1201
  struct ggml_context * ctx,
1202
  struct ggml_tensor * a,
 
1255
 
1256
  // conv_1d with padding = half
1257
  // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
1258
+ GGML_API struct ggml_tensor * ggml_conv_1d_ph(
1259
  struct ggml_context * ctx,
1260
  struct ggml_tensor * a,
1261
  struct ggml_tensor * b,
 
1268
  GGML_OP_POOL_COUNT,
1269
  };
1270
 
1271
+ GGML_API struct ggml_tensor * ggml_pool_1d(
1272
  struct ggml_context * ctx,
1273
  struct ggml_tensor * a,
1274
  enum ggml_op_pool op,
 
1276
  int s0, // stride
1277
  int p0); // padding
1278
 
1279
+ GGML_API struct ggml_tensor * ggml_pool_2d(
1280
  struct ggml_context * ctx,
1281
  struct ggml_tensor * a,
1282
  enum ggml_op_pool op,
 
1330
  int h0,
1331
  int w);
1332
 
 
 
1333
  GGML_API struct ggml_tensor * ggml_unary(
1334
  struct ggml_context * ctx,
1335
  struct ggml_tensor * a,
 
1340
  struct ggml_tensor * a,
1341
  enum ggml_unary_op op);
1342
 
1343
+ // custom operators
1344
+
1345
+ typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
1346
+ typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
1347
+
1348
+ typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
1349
+ typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1350
+ typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1351
+
1352
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
1353
  struct ggml_context * ctx,
1354
  struct ggml_tensor * a,
1355
+ ggml_unary_op_f32_t fun),
1356
+ "use ggml_map_custom1 instead");
1357
 
1358
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
1359
  struct ggml_context * ctx,
1360
  struct ggml_tensor * a,
1361
+ ggml_unary_op_f32_t fun),
1362
+ "use ggml_map_custom1_inplace instead");
1363
 
1364
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
1365
  struct ggml_context * ctx,
1366
  struct ggml_tensor * a,
1367
  struct ggml_tensor * b,
1368
+ ggml_binary_op_f32_t fun),
1369
+ "use ggml_map_custom2 instead");
1370
 
1371
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
1372
  struct ggml_context * ctx,
1373
  struct ggml_tensor * a,
1374
  struct ggml_tensor * b,
1375
+ ggml_binary_op_f32_t fun),
1376
+ "use ggml_map_custom2_inplace instead");
1377
 
1378
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
1379
  struct ggml_context * ctx,
1380
  struct ggml_tensor * a,
1381
+ ggml_custom1_op_f32_t fun),
1382
+ "use ggml_map_custom1 instead");
1383
 
1384
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
1385
  struct ggml_context * ctx,
1386
  struct ggml_tensor * a,
1387
+ ggml_custom1_op_f32_t fun),
1388
+ "use ggml_map_custom1_inplace instead");
1389
 
1390
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
1391
  struct ggml_context * ctx,
1392
  struct ggml_tensor * a,
1393
  struct ggml_tensor * b,
1394
+ ggml_custom2_op_f32_t fun),
1395
+ "use ggml_map_custom2 instead");
1396
 
1397
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
1398
  struct ggml_context * ctx,
1399
  struct ggml_tensor * a,
1400
  struct ggml_tensor * b,
1401
+ ggml_custom2_op_f32_t fun),
1402
+ "use ggml_map_custom2_inplace instead");
1403
 
1404
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
1405
  struct ggml_context * ctx,
1406
  struct ggml_tensor * a,
1407
  struct ggml_tensor * b,
1408
  struct ggml_tensor * c,
1409
+ ggml_custom3_op_f32_t fun),
1410
+ "use ggml_map_custom3 instead");
1411
 
1412
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
1413
  struct ggml_context * ctx,
1414
  struct ggml_tensor * a,
1415
  struct ggml_tensor * b,
1416
  struct ggml_tensor * c,
1417
+ ggml_custom3_op_f32_t fun),
1418
+ "use ggml_map_custom3_inplace instead");
1419
+
1420
+ // custom operators v2
1421
+
1422
+ typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
1423
+ typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
1424
+ typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
1425
+
1426
+ #define GGML_N_TASKS_MAX -1
1427
+
1428
+ GGML_API struct ggml_tensor * ggml_map_custom1(
1429
+ struct ggml_context * ctx,
1430
+ struct ggml_tensor * a,
1431
+ ggml_custom1_op_t fun,
1432
+ int n_tasks,
1433
+ void * userdata);
1434
+
1435
+ GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
1436
+ struct ggml_context * ctx,
1437
+ struct ggml_tensor * a,
1438
+ ggml_custom1_op_t fun,
1439
+ int n_tasks,
1440
+ void * userdata);
1441
+
1442
+ GGML_API struct ggml_tensor * ggml_map_custom2(
1443
+ struct ggml_context * ctx,
1444
+ struct ggml_tensor * a,
1445
+ struct ggml_tensor * b,
1446
+ ggml_custom2_op_t fun,
1447
+ int n_tasks,
1448
+ void * userdata);
1449
+
1450
+ GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
1451
+ struct ggml_context * ctx,
1452
+ struct ggml_tensor * a,
1453
+ struct ggml_tensor * b,
1454
+ ggml_custom2_op_t fun,
1455
+ int n_tasks,
1456
+ void * userdata);
1457
+
1458
+ GGML_API struct ggml_tensor * ggml_map_custom3(
1459
+ struct ggml_context * ctx,
1460
+ struct ggml_tensor * a,
1461
+ struct ggml_tensor * b,
1462
+ struct ggml_tensor * c,
1463
+ ggml_custom3_op_t fun,
1464
+ int n_tasks,
1465
+ void * userdata);
1466
+
1467
+ GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
1468
+ struct ggml_context * ctx,
1469
+ struct ggml_tensor * a,
1470
+ struct ggml_tensor * b,
1471
+ struct ggml_tensor * c,
1472
+ ggml_custom3_op_t fun,
1473
+ int n_tasks,
1474
+ void * userdata);
1475
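A minimal sketch (not from this commit; the callback and the work split are illustrative) of the v2 custom operators — the callback receives ith/nth, so passing GGML_N_TASKS_MAX lets one op use every compute thread, unlike the deprecated *_f32 variants which stay single-threaded:
static void my_relu(struct ggml_tensor * dst, const struct ggml_tensor * a,
                    int ith, int nth, void * userdata) {
    (void) userdata;                               // unused in this sketch
    const int64_t n  = ggml_nelements(dst);
    const int64_t dr = (n + nth - 1)/nth;          // elements per thread
    const int64_t i0 = dr*ith;
    const int64_t i1 = i0 + dr < n ? i0 + dr : n;
    const float * x = (const float *) a->data;
    float       * y = (float *)       dst->data;
    for (int64_t i = i0; i < i1; ++i) {
        y[i] = x[i] > 0.0f ? x[i] : 0.0f;
    }
}
// usage: struct ggml_tensor * out = ggml_map_custom1(ctx, t, my_relu, GGML_N_TASKS_MAX, NULL);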
 
1476
  // loss function
1477
 
 
1494
  struct ggml_context * ctx,
1495
  struct ggml_tensor * tensor);
1496
 
1497
+
1498
  GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1499
 
1500
  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
1501
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
1502
 
1503
+ // graph allocation in a context
1504
+ GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
1505
+ GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
1506
+ GGML_API size_t ggml_graph_overhead(void);
1507
+
1508
  // ggml_graph_plan() has to be called before ggml_graph_compute()
1509
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
1510
  GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
gpttype_adapter.cpp CHANGED
@@ -243,7 +243,7 @@ void sample_temperature(llama_token_data_array * candidates_p, float temp)
243
  if (temp <= 0)
244
  {
245
  // Imitate greedy sampling
246
- temp = 0.01f; //cannot be zero else div0
247
  llama_sample_temperature(nullptr, candidates_p, temp);
248
  llama_sample_top_k(nullptr, candidates_p, 1, 1); //only want first candidate
249
  }
@@ -347,6 +347,10 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
347
  debugmode = inputs.debugmode;
348
  unbanTokens = inputs.unban_tokens;
349
  blasbatchsize = inputs.blasbatchsize;
 
 
 
 
350
  params.memory_f16 = inputs.f16_kv;
351
  params.n_ctx = inputs.max_context_length;
352
 
@@ -374,7 +378,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
374
  else
375
  {
376
  //approximate NTK aware ctx
377
- rope_freq_base = (params.n_ctx <= 3072 ? 26000.0f : (params.n_ctx <= 4096 ? 32000.0f : (params.n_ctx <= 6144 ? 54000.0f : 82684.0f)));
 
378
  }
379
 
380
  printf("Using automatic RoPE scaling (scale:%.3f, base:%.1f)\n",rope_freq_scale,rope_freq_base);
@@ -466,6 +471,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
466
  llama_ctx_params.seed = -1;
467
  llama_ctx_params.f16_kv = inputs.f16_kv;
468
  llama_ctx_params.low_vram = inputs.low_vram;
 
469
  llama_ctx_params.logits_all = false;
470
  llama_ctx_params.use_mmap = inputs.use_mmap;
471
  llama_ctx_params.use_mlock = inputs.use_mlock;
 
243
  if (temp <= 0)
244
  {
245
  // Imitate greedy sampling
246
+ temp = 0.00390625f; //cannot be zero else div0, this is 1/256
247
  llama_sample_temperature(nullptr, candidates_p, temp);
248
  llama_sample_top_k(nullptr, candidates_p, 1, 1); //only want first candidate
249
  }
 
347
  debugmode = inputs.debugmode;
348
  unbanTokens = inputs.unban_tokens;
349
  blasbatchsize = inputs.blasbatchsize;
350
+ if(blasbatchsize<=0)
351
+ {
352
+ blasbatchsize = 8;
353
+ }
354
  params.memory_f16 = inputs.f16_kv;
355
  params.n_ctx = inputs.max_context_length;
356
 
 
378
  else
379
  {
380
  //approximate NTK aware ctx
381
+ rope_freq_base = (params.n_ctx <= 3072 ? 26000.0f : (params.n_ctx <= 4096 ? 32000.0f : (params.n_ctx <= 6144 ? 54000.0f : (params.n_ctx <= 8192 ? 82684.0f : (params.n_ctx <= 12288 ? 140000.0f : 200000.0f)))));
382
+
383
  }
384
 
385
  printf("Using automatic RoPE scaling (scale:%.3f, base:%.1f)\n",rope_freq_scale,rope_freq_base);
 
471
  llama_ctx_params.seed = -1;
472
  llama_ctx_params.f16_kv = inputs.f16_kv;
473
  llama_ctx_params.low_vram = inputs.low_vram;
474
+ llama_ctx_params.mul_mat_q = inputs.use_mmq;
475
  llama_ctx_params.logits_all = false;
476
  llama_ctx_params.use_mmap = inputs.use_mmap;
477
  llama_ctx_params.use_mlock = inputs.use_mlock;
grammars/json.gbnf CHANGED
@@ -1,29 +1,25 @@
1
- # Grammar for subset of JSON - doesn't support full string or number syntax
2
-
3
- root ::= object
4
- value ::= object | array | string | number | boolean | "null"
5
 
6
  object ::=
7
  "{" ws (
8
  string ":" ws value
9
  ("," ws string ":" ws value)*
10
- )? "}"
11
 
12
  array ::=
13
  "[" ws (
14
  value
15
  ("," ws value)*
16
- )? "]"
17
 
18
- string ::=
19
  "\"" (
20
  [^"\\] |
21
  "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
22
  )* "\"" ws
23
 
24
- # Only plain integers currently
25
- number ::= "-"? [0-9]+ ws
26
- boolean ::= ("true" | "false") ws
27
 
28
  # Optional space: by convention, applied in this grammar after literal chars when allowed
29
  ws ::= ([ \t\n] ws)?
 
1
+ root ::= object
2
+ value ::= object | array | string | number | ("true" | "false" | "null") ws
 
 
3
 
4
  object ::=
5
  "{" ws (
6
  string ":" ws value
7
  ("," ws string ":" ws value)*
8
+ )? "}" ws
9
 
10
  array ::=
11
  "[" ws (
12
  value
13
  ("," ws value)*
14
+ )? "]" ws
15
 
16
+ string ::=
17
  "\"" (
18
  [^"\\] |
19
  "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
20
  )* "\"" ws
21
 
22
+ number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
 
 
23
 
24
  # Optional space: by convention, applied in this grammar after literal chars when allowed
25
  ws ::= ([ \t\n] ws)?
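With the reworked rules, true/false/null are matched directly inside value, trailing whitespace is now allowed after "}" and "]", and number covers fractions and exponents, so for example {"pi": -3.5e2, "ok": true} is accepted where the old integer-only grammar rejected it.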
k_quants.c CHANGED
@@ -39,6 +39,8 @@
39
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
40
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
41
 
 
 
42
  //
43
  // 2-6 bit quantization in super-blocks
44
  //
@@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
1353
  const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
1354
  const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
1355
  const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
1356
- const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
1357
 
1358
  __m256i sumi = _mm256_setzero_si256();
1359
 
@@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
1421
  const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
1422
 
1423
  // sumf += -dmin * summs in 32bits*8
1424
- acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
1425
 
1426
  const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
1427
  const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
1493
  }
1494
 
1495
  // sumf += dall * isum - dmin * summs in 32bits
1496
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
1497
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
1498
  }
1499
 
@@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
1644
  summs += dmin * smin;
1645
 
1646
  const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
1647
- const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
1648
- const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
1649
 
1650
  const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
1651
  const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@@ -1709,10 +1711,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
1709
  const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
1710
  const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
1711
 
1712
- const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
1713
- const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
1714
- const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
1715
- const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
1716
 
1717
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
1718
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
@@ -1917,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
1917
  const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
1918
  const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
1919
  const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
1920
- const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
1921
 
1922
  // high bit
1923
  const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@@ -2128,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
2128
  }
2129
 
2130
  // multiply with block scale and accumulate
2131
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
2132
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
2133
 
2134
  }
@@ -2303,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
2303
  aux16[0] = a & 0x0f0f;
2304
  aux16[1] = (a >> 4) & 0x0f0f;
2305
 
2306
- const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
2307
- const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
2308
 
2309
  memcpy(&aux64, x[i].hmask, 8);
2310
 
2311
  const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
2312
- __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
2313
  __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
2314
  q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
2315
  q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@@ -2318,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
2318
  const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
2319
 
2320
  // prepare low and high bits
2321
- const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
2322
  const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
2323
  const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
2324
 
@@ -2429,7 +2431,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
2429
 
2430
  p16_0 = _mm_add_epi32(p16_0, p16_2);
2431
  p16_1 = _mm_add_epi32(p16_1, p16_3);
2432
- __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
2433
 
2434
  // multiply with block scale and accumulate
2435
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
@@ -2620,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
2620
  acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
2621
 
2622
  const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
2623
- const __m256i scales = _mm256_set_m128i(sc128, sc128);
2624
 
2625
  __m256i sumi = _mm256_setzero_si256();
2626
 
@@ -2727,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
2727
  }
2728
 
2729
  __m256 vd = _mm256_set1_ps(d);
2730
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
2731
  acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
2732
 
2733
  }
@@ -2968,11 +2970,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
2968
 
2969
  const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
2970
  const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
2971
- acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
2972
 
2973
  const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
2974
  const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
2975
- acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
2976
 
2977
  }
2978
 
@@ -3160,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
3160
  summs += dmin * _mm_extract_epi32(hsum, 0);
3161
 
3162
  const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
3163
- const __m256i scales = _mm256_set_m128i(sc128, sc128);
3164
 
3165
  const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
3166
  __m256i hmask = mone;
@@ -3299,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
3299
  }
3300
 
3301
  __m256 vd = _mm256_set1_ps(d);
3302
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
3303
  acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
3304
 
3305
  }
@@ -3462,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
3462
 
3463
  const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
3464
 
3465
- const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
3466
- const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
3467
 
3468
  int64_t aux64;
3469
  memcpy(&aux64, x[i].qh, 8);
3470
  const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
3471
- const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
3472
 
3473
  const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
3474
  const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@@ -3543,7 +3545,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
3543
  const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
3544
  const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
3545
 
3546
- acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
3547
 
3548
  }
3549
 
@@ -3925,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
3925
 
3926
  }
3927
 
3928
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
3929
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
3930
  }
3931
 
@@ -4083,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
4083
  const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
4084
  const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
4085
 
4086
- const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
4087
- const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
4088
 
4089
  const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
4090
  const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@@ -4177,7 +4179,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
4177
  sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
4178
  sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
4179
 
4180
- acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
4181
  }
4182
 
4183
  *s = hsum_float_8(acc);
 
39
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
40
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
41
 
42
+ #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
43
+
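The new macro builds a 256-bit vector from two 128-bit halves (b in the low lane, a in the high lane), i.e. the same result as _mm256_set_m128i(a, b); spelling it as cast + insert is presumably a workaround for compilers whose AVX headers lack _mm256_set_m128i.
// MM256_SET_M128I(hi, lo) == _mm256_set_m128i(hi, lo): hi -> bits 255..128, lo -> bits 127..0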
44
  //
45
  // 2-6 bit quantization in super-blocks
46
  //
 
1355
  const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
1356
  const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
1357
  const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
1358
+ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
1359
 
1360
  __m256i sumi = _mm256_setzero_si256();
1361
 
 
1423
  const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
1424
 
1425
  // sumf += -dmin * summs in 32bits*8
1426
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
1427
 
1428
  const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
1429
  const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
 
1495
  }
1496
 
1497
  // sumf += dall * isum - dmin * summs in 32bits
1498
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
1499
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
1500
  }
1501
 
 
1646
  summs += dmin * smin;
1647
 
1648
  const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
1649
+ const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
1650
+ const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
1651
 
1652
  const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
1653
  const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
 
1711
  const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
1712
  const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
1713
 
1714
+ const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
1715
+ const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
1716
+ const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
1717
+ const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
1718
 
1719
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
1720
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
 
1919
  const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
1920
  const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
1921
  const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
1922
+ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
1923
 
1924
  // high bit
1925
  const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
 
2130
  }
2131
 
2132
  // multiply with block scale and accumulate
2133
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
2134
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
2135
 
2136
  }
 
2305
  aux16[0] = a & 0x0f0f;
2306
  aux16[1] = (a >> 4) & 0x0f0f;
2307
 
2308
+ const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
2309
+ const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
2310
 
2311
  memcpy(&aux64, x[i].hmask, 8);
2312
 
2313
  const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
2314
+ __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
2315
  __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
2316
  q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
2317
  q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
 
2320
  const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
2321
 
2322
  // prepare low and high bits
2323
+ const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
2324
  const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
2325
  const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
2326
 
 
2431
 
2432
  p16_0 = _mm_add_epi32(p16_0, p16_2);
2433
  p16_1 = _mm_add_epi32(p16_1, p16_3);
2434
+ __m256i p16 = MM256_SET_M128I(p16_1, p16_0);
2435
 
2436
  // multiply with block scale and accumulate
2437
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
 
2622
  acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
2623
 
2624
  const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
2625
+ const __m256i scales = MM256_SET_M128I(sc128, sc128);
2626
 
2627
  __m256i sumi = _mm256_setzero_si256();
2628
 
 
2729
  }
2730
 
2731
  __m256 vd = _mm256_set1_ps(d);
2732
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
2733
  acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
2734
 
2735
  }
 
2970
 
2971
  const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
2972
  const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
2973
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
2974
 
2975
  const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
2976
  const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
2977
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
2978
 
2979
  }
2980
 
 
3162
  summs += dmin * _mm_extract_epi32(hsum, 0);
3163
 
3164
  const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
3165
+ const __m256i scales = MM256_SET_M128I(sc128, sc128);
3166
 
3167
  const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
3168
  __m256i hmask = mone;
 
3301
  }
3302
 
3303
  __m256 vd = _mm256_set1_ps(d);
3304
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
3305
  acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
3306
 
3307
  }
 
3464
 
3465
  const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
3466
 
3467
+ const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
3468
+ const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
3469
 
3470
  int64_t aux64;
3471
  memcpy(&aux64, x[i].qh, 8);
3472
  const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
3473
+ const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
3474
 
3475
  const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
3476
  const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
 
3545
  const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
3546
  const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
3547
 
3548
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
3549
 
3550
  }
3551
 
 
3927
 
3928
  }
3929
 
3930
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
3931
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
3932
  }
3933
 
 
4085
  const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
4086
  const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
4087
 
4088
+ const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
4089
+ const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
4090
 
4091
  const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
4092
  const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
 
4179
  sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
4180
  sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
4181
 
4182
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
4183
  }
4184
 
4185
  *s = hsum_float_8(acc);
klite.embd CHANGED
The diff for this file is too large to render. See raw diff
 
koboldcpp.py CHANGED
@@ -23,6 +23,7 @@ class load_model_inputs(ctypes.Structure):
23
  ("batch_size", ctypes.c_int),
24
  ("f16_kv", ctypes.c_bool),
25
  ("low_vram", ctypes.c_bool),
 
26
  ("executable_path", ctypes.c_char_p),
27
  ("model_filename", ctypes.c_char_p),
28
  ("lora_filename", ctypes.c_char_p),
@@ -65,7 +66,7 @@ class generation_inputs(ctypes.Structure):
65
 
66
  class generation_outputs(ctypes.Structure):
67
  _fields_ = [("status", ctypes.c_int),
68
- ("text", ctypes.c_char * 16384)]
69
 
70
  handle = None
71
 
@@ -89,29 +90,30 @@ def pick_existant_file(ntoption,nonntoption):
89
  lib_default = pick_existant_file("koboldcpp.dll","koboldcpp.so")
90
  lib_failsafe = pick_existant_file("koboldcpp_failsafe.dll","koboldcpp_failsafe.so")
91
  lib_openblas = pick_existant_file("koboldcpp_openblas.dll","koboldcpp_openblas.so")
92
- lib_openblas_noavx2 = pick_existant_file("koboldcpp_openblas_noavx2.dll","koboldcpp_openblas_noavx2.so")
93
  lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so")
94
  lib_cublas = pick_existant_file("koboldcpp_cublas.dll","koboldcpp_cublas.so")
95
 
96
 
97
  def init_library():
98
  global handle
99
- global lib_default,lib_failsafe,lib_openblas,lib_openblas_noavx2,lib_clblast,lib_cublas
100
 
101
  libname = ""
102
- use_blas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.
103
  use_clblast = False #uses CLBlast instead
104
  use_cublas = False #uses cublas instead
105
- use_noavx2 = False #uses openblas with no avx2 instructions
 
106
  if args.noavx2:
107
  use_noavx2 = True
108
- if not file_exists(lib_openblas_noavx2) or (os.name=='nt' and not file_exists("libopenblas.dll")):
109
- print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.")
110
- elif args.noblas:
 
111
  print("!!! Attempting to use FAILSAFE MODE !!!")
112
  else:
113
- use_blas = True
114
- print("Attempting to use non-avx2 compatibility library with OpenBLAS. A compatible libopenblas will be required.")
115
  elif args.useclblast:
116
  if not file_exists(lib_clblast) or (os.name=='nt' and not file_exists("clblast.dll")):
117
  print("Warning: CLBlast library file not found. Non-BLAS library will be used.")
@@ -130,22 +132,22 @@ def init_library():
130
  elif args.noblas:
131
  print("Attempting to library without OpenBLAS.")
132
  else:
133
- use_blas = True
134
  print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas will be required.")
135
  if sys.platform=="darwin":
136
  print("Mac OSX note: Some people have found Accelerate actually faster than OpenBLAS. To compare, run Koboldcpp with --noblas instead.")
137
 
138
  if use_noavx2:
139
- if use_blas:
140
- libname = lib_openblas_noavx2
141
- else:
142
  libname = lib_failsafe
 
 
143
  else:
144
  if use_clblast:
145
  libname = lib_clblast
146
  elif use_cublas:
147
  libname = lib_cublas
148
- elif use_blas:
149
  libname = lib_openblas
150
  else:
151
  libname = lib_default
@@ -178,6 +180,7 @@ def load_model(model_filename):
178
  inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
179
  inputs.threads = args.threads
180
  inputs.low_vram = (True if (args.usecublas and "lowvram" in args.usecublas) else False)
 
181
  inputs.blasthreads = args.blasthreads
182
  inputs.f16_kv = True
183
  inputs.use_mmap = (not args.nommap)
@@ -229,12 +232,17 @@ def load_model(model_filename):
229
  return ret
230
 
231
  def generate(prompt,max_length=20, max_context_length=512, temperature=0.8, top_k=120, top_a=0.0, top_p=0.85, typical_p=1.0, tfs=1.0, rep_pen=1.1, rep_pen_range=128, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], stream_sse=False):
 
232
  inputs = generation_inputs()
233
  outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs))
234
  inputs.prompt = prompt.encode("UTF-8")
235
  if max_length >= max_context_length:
236
  max_length = max_context_length-1
237
  inputs.max_context_length = max_context_length # this will resize the context buffer if changed
 
 
 
 
238
  inputs.max_length = max_length
239
  inputs.temperature = temperature
240
  inputs.top_k = top_k
@@ -295,9 +303,10 @@ maxhordectx = 1024
295
  maxhordelen = 256
296
  modelbusy = threading.Lock()
297
  defaultport = 5001
298
- KcppVersion = "1.37.1"
299
  showdebug = True
300
  showsamplerwarning = True
 
301
  exitcounter = 0
302
 
303
  class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
@@ -393,6 +402,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
393
 
394
  current_token = 0
395
 
 
396
  while not handle.has_finished():
397
  if current_token < handle.get_stream_count():
398
  token = handle.new_token(current_token)
@@ -402,10 +412,14 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
402
 
403
  current_token += 1
404
 
405
- tokenStr = ctypes.string_at(token).decode("UTF-8","ignore")
406
- event_data = {"token": tokenStr}
407
- event_str = json.dumps(event_data)
408
- await self.send_sse_event("message", event_str)
 
 
 
 
409
 
410
  await asyncio.sleep(0)
411
 
@@ -481,7 +495,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
481
  laste = handle.get_last_eval_time()
482
  lastc = handle.get_last_token_count()
483
  stopreason = handle.get_last_stop_reason()
484
- response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "stop_reason":stopreason}).encode())
485
 
486
  if response_body is None:
487
  self.send_response(404)
@@ -558,9 +572,12 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
558
  newprompt = fullprompt
559
 
560
  gen = asyncio.run(self.handle_request(genparams, newprompt, basic_api_flag, kai_sse_stream_flag))
 
561
  try:
562
- self.send_response(200)
563
- self.end_headers()
 
 
564
  self.wfile.write(json.dumps(gen).encode())
565
  except:
566
  print("Generate: The response could not be sent, maybe connection was terminated?")
@@ -629,7 +646,7 @@ def RunServerMultiThreaded(addr, port, embedded_kailite = None):
629
  exitcounter = 999
630
  self.httpd.server_close()
631
 
632
- numThreads = 6
633
  threadArr = []
634
  for i in range(numThreads):
635
  threadArr.append(Thread(i))
@@ -656,14 +673,13 @@ def show_new_gui():
656
  root.destroy()
657
  if not args.model_param:
658
  print("\nNo ggml model file was selected. Exiting.")
659
- time.sleep(2)
660
  sys.exit(2)
661
  return
662
 
663
  import customtkinter as ctk
664
-
665
  nextstate = 0 #0=exit, 1=launch, 2=oldgui
666
- windowwidth = 520
667
  windowheight = 500
668
  ctk.set_appearance_mode("dark")
669
  root = ctk.CTk()
@@ -684,13 +700,22 @@ def show_new_gui():
684
  tabcontentframe.grid_propagate(False)
685
 
686
  tabcontent = {}
687
-
 
 
 
 
 
 
 
688
  # slider data
689
- blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024"]
690
- blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024"]
691
- contextsize_text = ["512", "1024", "2048", "3072", "4096", "6144", "8192"]
692
- runopts = ["Use OpenBLAS","Use CLBlast", "Use CuBLAS", "Use No BLAS","Use OpenBLAS (Old CPU, noavx2)","Failsafe Mode (Old CPU, noavx)"]
693
-
 
 
694
  def tabbuttonaction(name):
695
  for t in tabcontent:
696
  if name == t:
@@ -756,6 +781,32 @@ def show_new_gui():
756
  button.grid(row=row+1, column=1, stick="nw")
757
  return
758
 
 
 
 
 
759
  # Vars - should be in scope to be used by multiple widgets
760
  gpulayers_var = ctk.StringVar(value="0")
761
  threads_var = ctk.StringVar(value=str(default_threads))
@@ -770,6 +821,7 @@ def show_new_gui():
770
  debugmode = ctk.IntVar()
771
 
772
  lowvram_var = ctk.IntVar()
 
773
 
774
  blas_threads_var = ctk.StringVar()
775
  blas_size_var = ctk.IntVar()
@@ -806,11 +858,13 @@ def show_new_gui():
806
  quick_tab = tabcontent["Quick Launch"]
807
 
808
  # gpu options
809
- quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 4, 50)
810
  quick_gpu_selector_label = makelabel(quick_tab, "GPU ID:", 3)
811
  quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly")
812
  CUDA_quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3","All"], width=60, variable=gpu_choice_var, state="readonly")
813
- quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM", lowvram_var, 5)
 
 
814
 
815
  def changerunmode(a,b,c):
816
  index = runopts_var.get()
@@ -836,9 +890,13 @@ def show_new_gui():
836
  if index == "Use CuBLAS":
837
  lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
838
  quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
 
 
839
  else:
840
  lowvram_box.grid_forget()
841
  quick_lowvram_box.grid_forget()
 
 
842
 
843
  if index == "Use CLBlast" or index == "Use CuBLAS":
844
  gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
@@ -856,19 +914,21 @@ def show_new_gui():
856
 
857
  runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
858
  runoptbox.grid(row=1, column=1,padx=8, stick="nw")
859
- runoptbox.set("Use OpenBLAS")
 
 
 
860
 
861
  # threads
862
  makelabelentry(quick_tab, "Threads:" , threads_var, 8, 50)
863
 
864
  # blas batch size
865
- makeslider(quick_tab, "BLAS Batch Size:", blasbatchsize_text, blas_size_var, 0, 6, 12, set=5)
866
 
867
  # quick boxes
868
  quick_boxes = {"Launch Browser": launchbrowser , "High Priority" : highpriority, "Streaming Mode":stream, "Use SmartContext":smartcontext, "Unban Tokens":unbantokens, "Disable MMAP":disablemmap,}
869
  for idx, name, in enumerate(quick_boxes):
870
  makecheckbox(quick_tab, name, quick_boxes[name], int(idx/2) +20, idx%2)
871
-
872
  # context size
873
  makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2)
874
 
@@ -879,19 +939,24 @@ def show_new_gui():
879
  hardware_tab = tabcontent["Hardware"]
880
 
881
  # gpu options
882
- gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 4, 50)
883
  gpu_selector_label = makelabel(hardware_tab, "GPU ID:", 3)
884
  gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly")
885
  CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3", "All"], width=60, variable=gpu_choice_var, state="readonly")
886
- lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 5)
 
887
 
888
  # presets selector
889
  makelabel(hardware_tab, "Presets:", 1)
890
  runoptbox = ctk.CTkComboBox(hardware_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
891
  runoptbox.grid(row=1, column=1,padx=8, stick="nw")
892
- runoptbox.set("Use OpenBLAS")
893
  runopts_var.trace('w', changerunmode)
894
  changerunmode(1,1,1)
 
 
 
 
895
  # threads
896
  makelabelentry(hardware_tab, "Threads:" , threads_var, 8, 50)
897
 
@@ -904,7 +969,7 @@ def show_new_gui():
904
  # blas thread specifier
905
  makelabelentry(hardware_tab, "BLAS threads:" , blas_threads_var, 11, 50)
906
  # blas batch size
907
- makeslider(hardware_tab, "BLAS Batch Size:", blasbatchsize_text, blas_size_var, 0, 6, 12, set=5)
908
  # force version
909
  makelabelentry(hardware_tab, "Force Version:" , version_var, 100, 50)
910
 
@@ -931,7 +996,7 @@ def show_new_gui():
931
  togglemiro(1,1,1)
932
 
933
  # context size
934
- makeslider(tokens_tab, "Context Size:",contextsize_text, context_var, 0, 4, 20, set=2)
935
 
936
 
937
  customrope_scale_entry, customrope_scale_label = makelabelentry(tokens_tab, "RoPE Scale:", customrope_scale)
@@ -1017,20 +1082,22 @@ def show_new_gui():
1017
  gpuchoiceidx = 0
1018
  if gpu_choice_var.get()!="All":
1019
  gpuchoiceidx = int(gpu_choice_var.get())-1
1020
- if runopts_var.get() == runopts[1]:
1021
  args.useclblast = [[0,0], [1,0], [0,1]][gpuchoiceidx]
1022
- if runopts_var.get() == runopts[2]:
1023
  if gpu_choice_var.get()=="All":
1024
  args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"]
1025
  else:
1026
  args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)]
 
 
1027
  if gpulayers_var.get():
1028
  args.gpulayers = int(gpulayers_var.get())
1029
- if runopts_var.get()==runopts[3]:
1030
  args.noblas = True
1031
- if runopts_var.get()==runopts[4]:
1032
  args.noavx2 = True
1033
- if runopts_var.get()==runopts[5]:
1034
  args.noavx2 = True
1035
  args.noblas = True
1036
  args.nommap = True
@@ -1040,7 +1107,7 @@ def show_new_gui():
1040
  args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())])
1041
  args.forceversion = 0 if version_var.get()=="" else int(version_var.get())
1042
 
1043
- args.mirostat = [int(mirostat_var.get()), float(mirostat_tau.get()), float(mirostat_eta.get())] if usemirostat.get()==1 else None
1044
  args.contextsize = int(contextsize_text[context_var.get()])
1045
 
1046
  if customrope_var.get()==1:
@@ -1069,38 +1136,39 @@ def show_new_gui():
1069
  stream.set(1 if "stream" in dict and dict["stream"] else 0)
1070
  smartcontext.set(1 if "smartcontext" in dict and dict["smartcontext"] else 0)
1071
  unbantokens.set(1 if "unbantokens" in dict and dict["unbantokens"] else 0)
1072
- runopts_var.set(runopts[0])
1073
  if "useclblast" in dict and dict["useclblast"]:
1074
- runopts_var.set(runopts[1])
1075
- gpu_choice_var.set(str(["0 0", "1 0", "0 1"].index(str(dict["useclblast"][0]) + " " + str(dict["useclblast"][1])) + 1))
 
1076
  elif "usecublas" in dict and dict["usecublas"]:
1077
- runopts_var.set(runopts[2])
1078
- if len(dict["usecublas"])==1:
1079
- lowvram_var.set(1 if dict["usecublas"][0]=="lowvram" else 0)
1080
- else:
1081
  lowvram_var.set(1 if "lowvram" in dict["usecublas"] else 0)
1082
- gpu_choice_var.set("1")
 
1083
  for g in range(3):
1084
  if str(g) in dict["usecublas"]:
1085
  gpu_choice_var.set(str(g+1))
1086
  break
1087
- if "gpulayers" in dict and dict["gpulayers"]:
1088
- gpulayers_var.set(dict["gpulayers"])
1089
-
1090
- if "noavx2" in dict and "noblas" in dict and dict["noblas"] and dict["noavx2"]:
1091
- runopts_var.set(runopts[5])
1092
  elif "noavx2" in dict and dict["noavx2"]:
1093
- runopts_var.set(runopts[4])
 
1094
  elif "noblas" in dict and dict["noblas"]:
1095
- runopts_var.set(runopts[3])
1096
  if "blasthreads" in dict and dict["blasthreads"]:
1097
  blas_threads_var.set(str(dict["blasthreads"]))
1098
  else:
1099
  blas_threads_var.set("")
1100
-
1101
  if "contextsize" in dict and dict["contextsize"]:
1102
  context_var.set(contextsize_text.index(str(dict["contextsize"])))
1103
-
1104
  if "ropeconfig" in dict and dict["ropeconfig"] and len(dict["ropeconfig"])>1:
1105
  if dict["ropeconfig"][0]>0:
1106
  customrope_var.set(1)
@@ -1114,11 +1182,11 @@ def show_new_gui():
1114
  if "forceversion" in dict and dict["forceversion"]:
1115
  version_var.set(str(dict["forceversion"]))
1116
 
1117
- if "mirostat" in dict and dict["mirostat"] and len(dict["mirostat"])>1:
1118
- usemirostat.set(0 if str(dict["mirostat"][0])=="0" else 1)
1119
- mirostat_var.set(str(dict["mirostat"][0]))
1120
- mirostat_tau.set(str(dict["mirostat"][1]))
1121
- mirostat_eta.set(str(dict["mirostat"][2]))
1122
 
1123
  if "model_param" in dict and dict["model_param"]:
1124
  model_var.set(dict["model_param"])
@@ -1165,18 +1233,26 @@ def show_new_gui():
1165
  import_vars(dict)
1166
  pass
1167
 
1168
 - ctk.CTkButton(tabs , text = "Launch", fg_color="#2f8d3c", command = guilaunch, width=80, height = 35 ).grid(row=1,column=1, stick="se", padx= 25, pady=5)
1169
 
1170
- ctk.CTkButton(tabs , text = "Save", fg_color="#084a66", command = save_config, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 5, pady=5)
1171
- ctk.CTkButton(tabs , text = "Load", fg_color="#084a66", command = load_config, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 70, pady=5)
 
1172
 
1173
- ctk.CTkButton(tabs , text = "Old GUI", fg_color="#084a66", command = switch_old_gui, width=100, height = 35 ).grid(row=1,column=0, stick="sw", padx= 5, pady=5)
1174
  # runs main loop until closed or launch clicked
1175
  root.mainloop()
1176
 
1177
  if nextstate==0:
1178
  print("Exiting by user request.")
1179
- time.sleep(2)
1180
  sys.exit()
1181
  elif nextstate==2:
1182
  time.sleep(0.1)
@@ -1187,16 +1263,23 @@ def show_new_gui():
1187
 
1188
  if not args.model_param:
1189
  print("\nNo ggml model file was selected. Exiting.")
1190
- time.sleep(2)
1191
  sys.exit(2)
1192
 
1193
- def show_gui_warning():
1194
  from tkinter import messagebox
1195
  import tkinter as tk
1196
  root = tk.Tk()
1197
  root.attributes("-alpha", 0)
1198
- messagebox.showerror(title="New GUI failed, using Old GUI", message="The new GUI failed to load.\n\nTo use new GUI, please install the customtkinter python module.")
1199
- root.destroy()
1200
 
1201
  def show_old_gui():
1202
  import tkinter as tk
@@ -1223,11 +1306,11 @@ def show_old_gui():
1223
  tk.Label(root, text = "(Note: KoboldCpp only works with GGML model formats!)",
1224
  font = ("Arial", 9)).grid(row=1,column=0)
1225
 
1226
- blasbatchopts = ["Don't Batch BLAS","BLAS = 32","BLAS = 64","BLAS = 128","BLAS = 256","BLAS = 512","BLAS = 1024"]
1227
  blaschoice = tk.StringVar()
1228
  blaschoice.set("BLAS = 512")
1229
 
1230
- runopts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use CuBLAS GPU","Use No BLAS","Use OpenBLAS (Old CPU, noavx2)","Failsafe Mode (Old CPU, noavx)"]
1231
  runchoice = tk.StringVar()
1232
  runchoice.set("Use OpenBLAS")
1233
 
@@ -1286,7 +1369,7 @@ def show_old_gui():
1286
 
1287
  if launchclicked==False:
1288
  print("Exiting by user request.")
1289
- time.sleep(2)
1290
  sys.exit()
1291
 
1292
  #load all the vars
@@ -1318,7 +1401,6 @@ def show_old_gui():
1318
  args.noavx2 = True
1319
  args.noblas = True
1320
  args.nommap = True
1321
- print("[Failsafe Mode : mmap is disabled.]")
1322
 
1323
  if selblaschoice==blasbatchopts[0]:
1324
  args.blasbatchsize = -1
@@ -1334,6 +1416,8 @@ def show_old_gui():
1334
  args.blasbatchsize = 512
1335
  if selblaschoice==blasbatchopts[6]:
1336
  args.blasbatchsize = 1024
 
 
1337
 
1338
  root = tk.Tk()
1339
  root.attributes("-alpha", 0)
@@ -1341,7 +1425,7 @@ def show_old_gui():
1341
  root.destroy()
1342
  if not args.model_param:
1343
  print("\nNo ggml model file was selected. Exiting.")
1344
- time.sleep(2)
1345
  sys.exit(2)
1346
 
1347
  else:
@@ -1351,7 +1435,7 @@ def show_old_gui():
1351
  root.destroy()
1352
  if not args.model_param:
1353
  print("\nNo ggml model file was selected. Exiting.")
1354
- time.sleep(2)
1355
  sys.exit(2)
1356
 
1357
  #A very simple and stripped down embedded horde worker with no dependencies
@@ -1396,7 +1480,7 @@ def run_horde_worker(args, api_key, worker_name):
1396
  BRIDGE_AGENT = f"KoboldCppEmbedWorker:1:https://github.com/LostRuins/koboldcpp"
1397
  cluster = "https://horde.koboldai.net"
1398
  while exitcounter < 10:
1399
- time.sleep(2)
1400
  readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
1401
  if readygo:
1402
  print("Embedded Horde Worker is started.")
@@ -1472,10 +1556,10 @@ def run_horde_worker(args, api_key, worker_name):
1472
  time.sleep(1)
1473
  if exitcounter<100:
1474
  print("Horde Worker Shutdown - Too many errors.")
1475
- time.sleep(2)
1476
  else:
1477
  print("Horde Worker Shutdown - Server Closing.")
1478
- time.sleep(1)
1479
  sys.exit(2)
1480
 
1481
  def main(args):
@@ -1499,7 +1583,7 @@ def main(args):
1499
  except Exception as ex2:
1500
  print("File selection GUI unsupported. Please check command line: script.py --help")
1501
  print("Reason for no GUI: " + str(ex2))
1502
- time.sleep(2)
1503
  sys.exit(2)
1504
 
1505
  if args.hordeconfig and args.hordeconfig[0]!="":
@@ -1543,20 +1627,20 @@ def main(args):
1543
  time.sleep(1)
1544
  if not os.path.exists(args.model_param):
1545
  print(f"Cannot find model file: {args.model_param}")
1546
- time.sleep(2)
1547
  sys.exit(2)
1548
 
1549
  if args.lora and args.lora[0]!="":
1550
  if not os.path.exists(args.lora[0]):
1551
  print(f"Cannot find lora file: {args.lora[0]}")
1552
- time.sleep(2)
1553
  sys.exit(2)
1554
  else:
1555
  args.lora[0] = os.path.abspath(args.lora[0])
1556
  if len(args.lora) > 1:
1557
  if not os.path.exists(args.lora[1]):
1558
  print(f"Cannot find lora base: {args.lora[1]}")
1559
- time.sleep(2)
1560
  sys.exit(2)
1561
  else:
1562
  args.lora[1] = os.path.abspath(args.lora[1])
@@ -1577,7 +1661,7 @@ def main(args):
1577
 
1578
  if not loadok:
1579
  print("Could not load model: " + modelname)
1580
- time.sleep(2)
1581
  sys.exit(3)
1582
  try:
1583
  basepath = os.path.abspath(os.path.dirname(__file__))
@@ -1605,6 +1689,7 @@ def main(args):
1605
 
1606
  if args.hordeconfig and len(args.hordeconfig)>4:
1607
  horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordeconfig[3],args.hordeconfig[4]))
 
1608
  horde_thread.start()
1609
 
1610
  print(f"Please connect to custom endpoint at {epurl}")
@@ -1631,8 +1716,8 @@ if __name__ == '__main__':
1631
  parser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)
1632
  parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
1633
  parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
1634
- parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192], default=2048)
1635
- parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512)
1636
  parser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
1637
  parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
1638
  parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
@@ -1649,7 +1734,7 @@ if __name__ == '__main__':
1649
  compatgroup = parser.add_mutually_exclusive_group()
1650
  compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
1651
  compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
1652
- compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID]'), choices=['normal', 'lowvram', '0', '1', '2'])
1653
  parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
1654
  parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
1655
 
 
23
  ("batch_size", ctypes.c_int),
24
  ("f16_kv", ctypes.c_bool),
25
  ("low_vram", ctypes.c_bool),
26
+ ("use_mmq", ctypes.c_bool),
27
  ("executable_path", ctypes.c_char_p),
28
  ("model_filename", ctypes.c_char_p),
29
  ("lora_filename", ctypes.c_char_p),
 
66
 
67
  class generation_outputs(ctypes.Structure):
68
  _fields_ = [("status", ctypes.c_int),
69
+ ("text", ctypes.c_char * 24576)]
70
 
71
  handle = None
72
 
 
90
  lib_default = pick_existant_file("koboldcpp.dll","koboldcpp.so")
91
  lib_failsafe = pick_existant_file("koboldcpp_failsafe.dll","koboldcpp_failsafe.so")
92
  lib_openblas = pick_existant_file("koboldcpp_openblas.dll","koboldcpp_openblas.so")
93
+ lib_noavx2 = pick_existant_file("koboldcpp_noavx2.dll","koboldcpp_noavx2.so")
94
  lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so")
95
  lib_cublas = pick_existant_file("koboldcpp_cublas.dll","koboldcpp_cublas.so")
96
 
97
 
98
  def init_library():
99
  global handle
100
+ global lib_default,lib_failsafe,lib_openblas,lib_noavx2,lib_clblast,lib_cublas
101
 
102
  libname = ""
103
+ use_openblas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.
104
  use_clblast = False #uses CLBlast instead
105
  use_cublas = False #uses cublas instead
106
+ use_noavx2 = False #uses no avx2 instructions
107
+ use_failsafe = False #uses no intrinsics, failsafe mode
108
  if args.noavx2:
109
  use_noavx2 = True
110
+ if not file_exists(lib_noavx2):
111
+ print("Warning: NoAVX2 library file not found. Failsafe library will be used.")
112
+ elif (args.noblas and args.nommap):
113
+ use_failsafe = True
114
  print("!!! Attempting to use FAILSAFE MODE !!!")
115
  else:
116
+ print("Attempting to use non-avx2 compatibility library.")
 
117
  elif args.useclblast:
118
  if not file_exists(lib_clblast) or (os.name=='nt' and not file_exists("clblast.dll")):
119
  print("Warning: CLBlast library file not found. Non-BLAS library will be used.")
 
132
  elif args.noblas:
133
  print("Attempting to library without OpenBLAS.")
134
  else:
135
+ use_openblas = True
136
  print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas will be required.")
137
  if sys.platform=="darwin":
138
  print("Mac OSX note: Some people have found Accelerate actually faster than OpenBLAS. To compare, run Koboldcpp with --noblas instead.")
139
 
140
  if use_noavx2:
141
+ if use_failsafe:
 
 
142
  libname = lib_failsafe
143
+ else:
144
+ libname = lib_noavx2
145
  else:
146
  if use_clblast:
147
  libname = lib_clblast
148
  elif use_cublas:
149
  libname = lib_cublas
150
+ elif use_openblas:
151
  libname = lib_openblas
152
  else:
153
  libname = lib_default
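
The revised init_library() chain above boils down to a fixed priority: the no-AVX2 path wins first (dropping to the failsafe build when --noblas and --nommap are both set), then CLBlast, CuBLAS, OpenBLAS, and finally the plain default build. A minimal standalone sketch of that decision, with an illustrative helper name that is not part of the shipped script:

    def pick_library(noavx2=False, failsafe=False, clblast=False, cublas=False, openblas=False):
        # Mirrors the chain above: no-AVX2/failsafe take precedence, then the
        # GPU backends, then OpenBLAS, then the default build.
        if noavx2:
            return "koboldcpp_failsafe" if failsafe else "koboldcpp_noavx2"
        if clblast:
            return "koboldcpp_clblast"
        if cublas:
            return "koboldcpp_cublas"
        if openblas:
            return "koboldcpp_openblas"
        return "koboldcpp"

    # e.g. pick_library(cublas=True) -> "koboldcpp_cublas"
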
 
180
  inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
181
  inputs.threads = args.threads
182
  inputs.low_vram = (True if (args.usecublas and "lowvram" in args.usecublas) else False)
183
+ inputs.use_mmq = (True if (args.usecublas and "mmq" in args.usecublas) else False)
184
  inputs.blasthreads = args.blasthreads
185
  inputs.f16_kv = True
186
  inputs.use_mmap = (not args.nommap)
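
Because --usecublas takes a free-form list of tokens, the two membership tests above are all that is needed: "lowvram" flips low_vram, the new "mmq" flips use_mmq, and (per the flag's help text) a bare digit selects the main GPU. A hedged sketch of that parsing, separate from the real launcher code and with an illustrative function name:

    def parse_usecublas(tokens):
        # tokens come straight from argparse, e.g. ["lowvram", "0", "mmq"]
        low_vram = "lowvram" in tokens
        use_mmq = "mmq" in tokens
        gpu_id = next((int(t) for t in tokens if t.isdigit()), None)  # None means all GPUs
        return low_vram, use_mmq, gpu_id

    # parse_usecublas(["lowvram", "0", "mmq"]) -> (True, True, 0)
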
 
232
  return ret
233
 
234
  def generate(prompt,max_length=20, max_context_length=512, temperature=0.8, top_k=120, top_a=0.0, top_p=0.85, typical_p=1.0, tfs=1.0, rep_pen=1.1, rep_pen_range=128, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], stream_sse=False):
235
+ global maxctx
236
  inputs = generation_inputs()
237
  outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs))
238
  inputs.prompt = prompt.encode("UTF-8")
239
  if max_length >= max_context_length:
240
  max_length = max_context_length-1
241
  inputs.max_context_length = max_context_length # this will resize the context buffer if changed
242
+ global showmaxctxwarning
243
+ if showmaxctxwarning and max_context_length > maxctx:
244
+ print(f"\n(Warning! Request max_context_length={max_context_length} exceeds allocated context size of {maxctx}. Consider launching with increased --contextsize to avoid errors. This message will only show once per session.)")
245
+ showmaxctxwarning = False
246
  inputs.max_length = max_length
247
  inputs.temperature = temperature
248
  inputs.top_k = top_k
 
303
  maxhordelen = 256
304
  modelbusy = threading.Lock()
305
  defaultport = 5001
306
+ KcppVersion = "1.40.1"
307
  showdebug = True
308
  showsamplerwarning = True
309
+ showmaxctxwarning = True
310
  exitcounter = 0
311
 
312
  class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
 
402
 
403
  current_token = 0
404
 
405
+ incomplete_token_buffer = bytearray()
406
  while not handle.has_finished():
407
  if current_token < handle.get_stream_count():
408
  token = handle.new_token(current_token)
 
412
 
413
  current_token += 1
414
 
415
+ newbyte = ctypes.string_at(token)
416
+ incomplete_token_buffer += bytearray(newbyte)
417
+ tokenStr = incomplete_token_buffer.decode("UTF-8","ignore")
418
+ if tokenStr!="":
419
+ incomplete_token_buffer.clear()
420
+ event_data = {"token": tokenStr}
421
+ event_str = json.dumps(event_data)
422
+ await self.send_sse_event("message", event_str)
423
 
424
  await asyncio.sleep(0)
425
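
The incomplete_token_buffer added above exists because a single sampled token can be a partial UTF-8 sequence (for example, half of a multi-byte character). Raw bytes are accumulated and an SSE event is only emitted once they decode to visible text. A self-contained sketch of the same buffering idea, with illustrative names:

    class Utf8TokenBuffer:
        def __init__(self):
            self.buf = bytearray()

        def feed(self, raw_bytes):
            # Accumulate raw token bytes; return decoded text once something
            # printable is available, mirroring the streaming loop above.
            self.buf += raw_bytes
            text = self.buf.decode("utf-8", "ignore")
            if text:
                self.buf.clear()
                return text
            return None

    # b = Utf8TokenBuffer()
    # b.feed(b"\xe2\x82") -> None (incomplete sequence is held back)
    # b.feed(b"\xac")     -> "€"
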
 
 
495
  laste = handle.get_last_eval_time()
496
  lastc = handle.get_last_token_count()
497
  stopreason = handle.get_last_stop_reason()
498
+ response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "stop_reason":stopreason, "idle":(0 if modelbusy.locked() else 1)}).encode())
499
 
500
  if response_body is None:
501
  self.send_response(404)
 
572
  newprompt = fullprompt
573
 
574
  gen = asyncio.run(self.handle_request(genparams, newprompt, basic_api_flag, kai_sse_stream_flag))
575
+
576
  try:
577
+ # Headers are already sent when streaming
578
+ if not kai_sse_stream_flag:
579
+ self.send_response(200)
580
+ self.end_headers()
581
  self.wfile.write(json.dumps(gen).encode())
582
  except:
583
  print("Generate: The response could not be sent, maybe connection was terminated?")
 
646
  exitcounter = 999
647
  self.httpd.server_close()
648
 
649
+ numThreads = 8
650
  threadArr = []
651
  for i in range(numThreads):
652
  threadArr.append(Thread(i))
 
673
  root.destroy()
674
  if not args.model_param:
675
  print("\nNo ggml model file was selected. Exiting.")
676
+ time.sleep(3)
677
  sys.exit(2)
678
  return
679
 
680
  import customtkinter as ctk
 
681
  nextstate = 0 #0=exit, 1=launch, 2=oldgui
682
+ windowwidth = 530
683
  windowheight = 500
684
  ctk.set_appearance_mode("dark")
685
  root = ctk.CTk()
 
700
  tabcontentframe.grid_propagate(False)
701
 
702
  tabcontent = {}
703
+ lib_option_pairs = [
704
+ (lib_openblas, "Use OpenBLAS"),
705
+ (lib_clblast, "Use CLBlast"),
706
+ (lib_cublas, "Use CuBLAS"),
707
+ (lib_default, "Use No BLAS"),
708
+ (lib_noavx2, "NoAVX2 Mode (Old CPU)"),
709
+ (lib_failsafe, "Failsafe Mode (Old CPU)")]
710
+ openblas_option, clblast_option, cublas_option, default_option, noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
711
  # slider data
712
+ blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024", "2048"]
713
+ blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024","2048"]
714
+ contextsize_text = ["512", "1024", "2048", "3072", "4096", "6144", "8192", "12288", "16384"]
715
+ runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib) or os.name == 'nt' and file_exists(opt + ".dll")]
716
+ antirunopts = [opt.replace("Use ", "") for lib, opt in lib_option_pairs if not file_exists(lib) or os.name == 'nt' and not file_exists(opt + ".dll")]
717
+ if not any(runopts):
718
+ show_gui_warning("No Backend Available")
719
  def tabbuttonaction(name):
720
  for t in tabcontent:
721
  if name == t:
 
781
  button.grid(row=row+1, column=1, stick="nw")
782
  return
783
 
784
+ def show_tooltip(event, tooltip_text=None):
785
+ if hasattr(show_tooltip, "_tooltip"):
786
+ tooltip = show_tooltip._tooltip
787
+ else:
788
+ tooltip = ctk.CTkToplevel(root)
789
+ tooltip.configure(fg_color="#ffffe0")
790
+ tooltip.withdraw()
791
+ tooltip.overrideredirect(True)
792
+ tooltip_label = ctk.CTkLabel(tooltip, text=tooltip_text, text_color="#000000", fg_color="#ffffe0")
793
+ tooltip_label.pack(expand=True, padx=2, pady=1)
794
+ show_tooltip._tooltip = tooltip
795
+ x, y = root.winfo_pointerxy()
796
+ tooltip.wm_geometry(f"+{x + 10}+{y + 10}")
797
+ tooltip.deiconify()
798
+ def hide_tooltip(event):
799
+ if hasattr(show_tooltip, "_tooltip"):
800
+ tooltip = show_tooltip._tooltip
801
+ tooltip.withdraw()
802
+ def setup_backend_tooltip(parent):
803
+ num_backends_built = makelabel(parent, str(len(runopts)) + "/6", 5, 2)
804
+ num_backends_built.grid(row=1, column=2, padx=0, pady=0)
805
+ num_backends_built.configure(text_color="#00ff00")
806
+ # Bind the backend count label with the tooltip function
807
+ num_backends_built.bind("<Enter>", lambda event: show_tooltip(event, f"This is the number of backends you have built and available." + (f"\nMissing: {', '.join(antirunopts)}" if len(runopts) != 6 else "")))
808
+ num_backends_built.bind("<Leave>", hide_tooltip)
809
+
810
  # Vars - should be in scope to be used by multiple widgets
811
  gpulayers_var = ctk.StringVar(value="0")
812
  threads_var = ctk.StringVar(value=str(default_threads))
 
821
  debugmode = ctk.IntVar()
822
 
823
  lowvram_var = ctk.IntVar()
824
+ mmq_var = ctk.IntVar()
825
 
826
  blas_threads_var = ctk.StringVar()
827
  blas_size_var = ctk.IntVar()
 
858
  quick_tab = tabcontent["Quick Launch"]
859
 
860
  # gpu options
861
+ quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 5, 50)
862
  quick_gpu_selector_label = makelabel(quick_tab, "GPU ID:", 3)
863
  quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly")
864
  CUDA_quick_gpu_selector_box = ctk.CTkComboBox(quick_tab, values=["1","2","3","All"], width=60, variable=gpu_choice_var, state="readonly")
865
+ quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM", lowvram_var, 4,0)
866
+ quick_mmq_box = makecheckbox(quick_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1)
867
+
868
 
869
  def changerunmode(a,b,c):
870
  index = runopts_var.get()
 
890
  if index == "Use CuBLAS":
891
  lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
892
  quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
893
+ mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
894
+ quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
895
  else:
896
  lowvram_box.grid_forget()
897
  quick_lowvram_box.grid_forget()
898
+ mmq_box.grid_forget()
899
+ quick_mmq_box.grid_forget()
900
 
901
  if index == "Use CLBlast" or index == "Use CuBLAS":
902
  gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
 
914
 
915
  runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
916
  runoptbox.grid(row=1, column=1,padx=8, stick="nw")
917
+ runoptbox.set(runopts[0]) # Set to first available option
918
+
919
+ # Tell user how many backends are available
920
+ setup_backend_tooltip(quick_tab)
921
 
922
  # threads
923
  makelabelentry(quick_tab, "Threads:" , threads_var, 8, 50)
924
 
925
  # blas batch size
926
+ makeslider(quick_tab, "BLAS Batch Size:", blasbatchsize_text, blas_size_var, 0, 7, 12, set=5)
927
 
928
  # quick boxes
929
  quick_boxes = {"Launch Browser": launchbrowser , "High Priority" : highpriority, "Streaming Mode":stream, "Use SmartContext":smartcontext, "Unban Tokens":unbantokens, "Disable MMAP":disablemmap,}
930
  for idx, name, in enumerate(quick_boxes):
931
  makecheckbox(quick_tab, name, quick_boxes[name], int(idx/2) +20, idx%2)
 
932
  # context size
933
  makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2)
934
 
 
939
  hardware_tab = tabcontent["Hardware"]
940
 
941
  # gpu options
942
+ gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50)
943
  gpu_selector_label = makelabel(hardware_tab, "GPU ID:", 3)
944
  gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3"], width=60, variable=gpu_choice_var, state="readonly")
945
  CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3", "All"], width=60, variable=gpu_choice_var, state="readonly")
946
+ lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0)
947
+ mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1)
948
 
949
  # presets selector
950
  makelabel(hardware_tab, "Presets:", 1)
951
  runoptbox = ctk.CTkComboBox(hardware_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
952
  runoptbox.grid(row=1, column=1,padx=8, stick="nw")
953
+ runoptbox.set(runopts[0]) # Set to first available option
954
  runopts_var.trace('w', changerunmode)
955
  changerunmode(1,1,1)
956
+
957
+ # Tell user how many backends are available
958
+ setup_backend_tooltip(hardware_tab)
959
+
960
  # threads
961
  makelabelentry(hardware_tab, "Threads:" , threads_var, 8, 50)
962
 
 
969
  # blas thread specifier
970
  makelabelentry(hardware_tab, "BLAS threads:" , blas_threads_var, 11, 50)
971
  # blas batch size
972
+ makeslider(hardware_tab, "BLAS Batch Size:", blasbatchsize_text, blas_size_var, 0, 7, 12, set=5)
973
  # force version
974
  makelabelentry(hardware_tab, "Force Version:" , version_var, 100, 50)
975
 
 
996
  togglemiro(1,1,1)
997
 
998
  # context size
999
+ makeslider(tokens_tab, "Context Size:",contextsize_text, context_var, 0, len(contextsize_text)-1, 20, set=2)
1000
 
1001
 
1002
  customrope_scale_entry, customrope_scale_label = makelabelentry(tokens_tab, "RoPE Scale:", customrope_scale)
 
1082
  gpuchoiceidx = 0
1083
  if gpu_choice_var.get()!="All":
1084
  gpuchoiceidx = int(gpu_choice_var.get())-1
1085
+ if runopts_var.get() == "Use CLBlast":
1086
  args.useclblast = [[0,0], [1,0], [0,1]][gpuchoiceidx]
1087
+ if runopts_var.get() == "Use CuBLAS":
1088
  if gpu_choice_var.get()=="All":
1089
  args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"]
1090
  else:
1091
  args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)]
1092
+ if mmq_var.get()==1:
1093
+ args.usecublas.append("mmq")
1094
  if gpulayers_var.get():
1095
  args.gpulayers = int(gpulayers_var.get())
1096
+ if runopts_var.get()=="Use No BLAS":
1097
  args.noblas = True
1098
+ if runopts_var.get()=="NoAVX2 Mode (Old CPU)":
1099
  args.noavx2 = True
1100
+ if runopts_var.get()=="Failsafe Mode (Old CPU)":
1101
  args.noavx2 = True
1102
  args.noblas = True
1103
  args.nommap = True
 
1107
  args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())])
1108
  args.forceversion = 0 if version_var.get()=="" else int(version_var.get())
1109
 
1110
+ args.usemirostat = [int(mirostat_var.get()), float(mirostat_tau.get()), float(mirostat_eta.get())] if usemirostat.get()==1 else None
1111
  args.contextsize = int(contextsize_text[context_var.get()])
1112
 
1113
  if customrope_var.get()==1:
 
1136
  stream.set(1 if "stream" in dict and dict["stream"] else 0)
1137
  smartcontext.set(1 if "smartcontext" in dict and dict["smartcontext"] else 0)
1138
  unbantokens.set(1 if "unbantokens" in dict and dict["unbantokens"] else 0)
 
1139
  if "useclblast" in dict and dict["useclblast"]:
1140
+ if clblast_option is not None:
1141
+ runopts_var.set(clblast_option)
1142
+ gpu_choice_var.set(str(["0 0", "1 0", "0 1"].index(str(dict["useclblast"][0]) + " " + str(dict["useclblast"][1])) + 1))
1143
  elif "usecublas" in dict and dict["usecublas"]:
1144
+ if cublas_option is not None:
1145
+ runopts_var.set(cublas_option)
 
 
1146
  lowvram_var.set(1 if "lowvram" in dict["usecublas"] else 0)
1147
+ mmq_var.set(1 if "mmq" in dict["usecublas"] else 0)
1148
+ gpu_choice_var.set("All")
1149
  for g in range(3):
1150
  if str(g) in dict["usecublas"]:
1151
  gpu_choice_var.set(str(g+1))
1152
  break
1153
+ elif "noavx2" in dict and "noblas" in dict and dict["noblas"] and dict["noavx2"]:
1154
+ if failsafe_option is not None:
1155
+ runopts_var.set(failsafe_option)
 
 
1156
  elif "noavx2" in dict and dict["noavx2"]:
1157
+ if noavx2_option is not None:
1158
+ runopts_var.set(noavx2_option)
1159
  elif "noblas" in dict and dict["noblas"]:
1160
+ if default_option is not None:
1161
+ runopts_var.set(default_option)
1162
+ elif openblas_option is not None:
1163
+ runopts_var.set(openblas_option)
1164
+ if "gpulayers" in dict and dict["gpulayers"]:
1165
+ gpulayers_var.set(dict["gpulayers"])
1166
  if "blasthreads" in dict and dict["blasthreads"]:
1167
  blas_threads_var.set(str(dict["blasthreads"]))
1168
  else:
1169
  blas_threads_var.set("")
 
1170
  if "contextsize" in dict and dict["contextsize"]:
1171
  context_var.set(contextsize_text.index(str(dict["contextsize"])))
 
1172
  if "ropeconfig" in dict and dict["ropeconfig"] and len(dict["ropeconfig"])>1:
1173
  if dict["ropeconfig"][0]>0:
1174
  customrope_var.set(1)
 
1182
  if "forceversion" in dict and dict["forceversion"]:
1183
  version_var.set(str(dict["forceversion"]))
1184
 
1185
+ if "usemirostat" in dict and dict["usemirostat"] and len(dict["usemirostat"])>1:
1186
+ usemirostat.set(0 if str(dict["usemirostat"][0])=="0" else 1)
1187
+ mirostat_var.set(str(dict["usemirostat"][0]))
1188
+ mirostat_tau.set(str(dict["usemirostat"][1]))
1189
+ mirostat_eta.set(str(dict["usemirostat"][2]))
1190
 
1191
  if "model_param" in dict and dict["model_param"]:
1192
  model_var.set(dict["model_param"])
 
1233
  import_vars(dict)
1234
  pass
1235
 
1236
+ def display_help():
1237
+ try:
1238
+ import webbrowser as wb
1239
+ wb.open("https://github.com/LostRuins/koboldcpp/wiki")
1240
+ except:
1241
+ print("Cannot launch help browser.")
1242
+
1243
+ ctk.CTkButton(tabs , text = "Launch", fg_color="#2f8d3c", hover_color="#2faa3c", command = guilaunch, width=80, height = 35 ).grid(row=1,column=1, stick="se", padx= 25, pady=5)
1244
 
1245
+ ctk.CTkButton(tabs , text = "Save", fg_color="#084a66", hover_color="#085a88", command = save_config, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 5, pady=5)
1246
+ ctk.CTkButton(tabs , text = "Load", fg_color="#084a66", hover_color="#085a88", command = load_config, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 70, pady=5)
1247
+ ctk.CTkButton(tabs , text = "Help", fg_color="#992222", hover_color="#bb3333", command = display_help, width=60, height = 35 ).grid(row=1,column=1, stick="sw", padx= 135, pady=5)
1248
 
1249
+ ctk.CTkButton(tabs , text = "Old GUI", fg_color="#084a66", hover_color="#085a88", command = switch_old_gui, width=100, height = 35 ).grid(row=1,column=0, stick="sw", padx= 5, pady=5)
1250
  # runs main loop until closed or launch clicked
1251
  root.mainloop()
1252
 
1253
  if nextstate==0:
1254
  print("Exiting by user request.")
1255
+ time.sleep(3)
1256
  sys.exit()
1257
  elif nextstate==2:
1258
  time.sleep(0.1)
 
1263
 
1264
  if not args.model_param:
1265
  print("\nNo ggml model file was selected. Exiting.")
1266
+ time.sleep(3)
1267
  sys.exit(2)
1268
 
1269
+ def show_gui_warning(issue=None):
1270
  from tkinter import messagebox
1271
  import tkinter as tk
1272
  root = tk.Tk()
1273
  root.attributes("-alpha", 0)
1274
+ if issue == "No Backend Available":
1275
+ messagebox.showerror(title="No Backends Available!", message="KoboldCPP couldn't locate any backends to use.\n\nTo use the program, please run the 'make' command from the directory.")
1276
+ root.destroy()
1277
+ print("No Backend Available (i.e Default, OpenBLAS, CLBlast, CuBLAS). To use the program, please run the 'make' command from the directory.")
1278
+ time.sleep(3)
1279
+ sys.exit(2)
1280
+ else:
1281
+ messagebox.showerror(title="New GUI failed, using Old GUI", message="The new GUI failed to load.\n\nTo use new GUI, please install the customtkinter python module.")
1282
+ root.destroy()
1283
 
1284
  def show_old_gui():
1285
  import tkinter as tk
 
1306
  tk.Label(root, text = "(Note: KoboldCpp only works with GGML model formats!)",
1307
  font = ("Arial", 9)).grid(row=1,column=0)
1308
 
1309
+ blasbatchopts = ["Don't Batch BLAS","BLAS = 32","BLAS = 64","BLAS = 128","BLAS = 256","BLAS = 512","BLAS = 1024","BLAS = 2048"]
1310
  blaschoice = tk.StringVar()
1311
  blaschoice.set("BLAS = 512")
1312
 
1313
+ runopts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use CuBLAS GPU","Use No BLAS","NoAVX2 Mode (Old CPU)","Failsafe Mode (Old CPU)"]
1314
  runchoice = tk.StringVar()
1315
  runchoice.set("Use OpenBLAS")
1316
 
 
1369
 
1370
  if launchclicked==False:
1371
  print("Exiting by user request.")
1372
+ time.sleep(3)
1373
  sys.exit()
1374
 
1375
  #load all the vars
 
1401
  args.noavx2 = True
1402
  args.noblas = True
1403
  args.nommap = True
 
1404
 
1405
  if selblaschoice==blasbatchopts[0]:
1406
  args.blasbatchsize = -1
 
1416
  args.blasbatchsize = 512
1417
  if selblaschoice==blasbatchopts[6]:
1418
  args.blasbatchsize = 1024
1419
+ if selblaschoice==blasbatchopts[7]:
1420
+ args.blasbatchsize = 2048
1421
 
1422
  root = tk.Tk()
1423
  root.attributes("-alpha", 0)
 
1425
  root.destroy()
1426
  if not args.model_param:
1427
  print("\nNo ggml model file was selected. Exiting.")
1428
+ time.sleep(3)
1429
  sys.exit(2)
1430
 
1431
  else:
 
1435
  root.destroy()
1436
  if not args.model_param:
1437
  print("\nNo ggml model file was selected. Exiting.")
1438
+ time.sleep(3)
1439
  sys.exit(2)
1440
 
1441
  #A very simple and stripped down embedded horde worker with no dependencies
 
1480
  BRIDGE_AGENT = f"KoboldCppEmbedWorker:1:https://github.com/LostRuins/koboldcpp"
1481
  cluster = "https://horde.koboldai.net"
1482
  while exitcounter < 10:
1483
+ time.sleep(3)
1484
  readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
1485
  if readygo:
1486
  print("Embedded Horde Worker is started.")
 
1556
  time.sleep(1)
1557
  if exitcounter<100:
1558
  print("Horde Worker Shutdown - Too many errors.")
1559
+ time.sleep(3)
1560
  else:
1561
  print("Horde Worker Shutdown - Server Closing.")
1562
+ time.sleep(2)
1563
  sys.exit(2)
1564
 
1565
  def main(args):
 
1583
  except Exception as ex2:
1584
  print("File selection GUI unsupported. Please check command line: script.py --help")
1585
  print("Reason for no GUI: " + str(ex2))
1586
+ time.sleep(3)
1587
  sys.exit(2)
1588
 
1589
  if args.hordeconfig and args.hordeconfig[0]!="":
 
1627
  time.sleep(1)
1628
  if not os.path.exists(args.model_param):
1629
  print(f"Cannot find model file: {args.model_param}")
1630
+ time.sleep(3)
1631
  sys.exit(2)
1632
 
1633
  if args.lora and args.lora[0]!="":
1634
  if not os.path.exists(args.lora[0]):
1635
  print(f"Cannot find lora file: {args.lora[0]}")
1636
+ time.sleep(3)
1637
  sys.exit(2)
1638
  else:
1639
  args.lora[0] = os.path.abspath(args.lora[0])
1640
  if len(args.lora) > 1:
1641
  if not os.path.exists(args.lora[1]):
1642
  print(f"Cannot find lora base: {args.lora[1]}")
1643
+ time.sleep(3)
1644
  sys.exit(2)
1645
  else:
1646
  args.lora[1] = os.path.abspath(args.lora[1])
 
1661
 
1662
  if not loadok:
1663
  print("Could not load model: " + modelname)
1664
+ time.sleep(3)
1665
  sys.exit(3)
1666
  try:
1667
  basepath = os.path.abspath(os.path.dirname(__file__))
 
1689
 
1690
  if args.hordeconfig and len(args.hordeconfig)>4:
1691
  horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordeconfig[3],args.hordeconfig[4]))
1692
+ horde_thread.daemon = True
1693
  horde_thread.start()
1694
 
1695
  print(f"Please connect to custom endpoint at {epurl}")
 
1716
  parser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)
1717
  parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
1718
  parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
1719
+ parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192,12288,16384], default=2048)
1720
+ parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024,2048], default=512)
1721
  parser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
1722
  parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
1723
  parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
 
1734
  compatgroup = parser.add_mutually_exclusive_group()
1735
  compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
1736
  compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
1737
+ compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq]'), choices=['normal', 'lowvram', '0', '1', '2', 'mmq'])
1738
  parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
1739
  parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
1740
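
With the widened choices above, a CuBLAS launch can now combine the quantized matmul path, a larger context, and the 2048 BLAS batch in one command line. A trimmed-down, illustrative parser (only the flags touched by this commit, not the full argument set) showing what such an invocation parses to:

    import argparse

    p = argparse.ArgumentParser()
    p.add_argument("--contextsize", type=int, choices=[512,1024,2048,3072,4096,6144,8192,12288,16384], default=2048)
    p.add_argument("--blasbatchsize", type=int, choices=[-1,32,64,128,256,512,1024,2048], default=512)
    p.add_argument("--usecublas", nargs='*', choices=['normal','lowvram','0','1','2','mmq'])
    p.add_argument("--gpulayers", type=int, default=0)

    args = p.parse_args("--usecublas lowvram 0 mmq --gpulayers 35 --contextsize 8192".split())
    # args.usecublas == ['lowvram', '0', 'mmq'], args.contextsize == 8192
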
 
llama-util.h CHANGED
@@ -149,6 +149,46 @@ struct llama_file {
149
  }
150
  };
151
152
  #if defined(_WIN32)
153
  static std::string llama_format_win_err(DWORD err) {
154
  LPSTR buf;
@@ -179,7 +219,7 @@ struct llama_mmap {
179
  // prefetch/readahead impairs performance on NUMA systems
180
  if (numa) { prefetch = 0; }
181
  #ifdef __linux__
182
- if (prefetch) { flags |= MAP_POPULATE; }
183
  #endif
184
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
185
  if (addr == MAP_FAILED) {
@@ -247,7 +287,7 @@ struct llama_mmap {
247
  #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
248
  #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
249
  #else
250
- printf("\nPrefetchVirtualMemory skipped in failsafe mode.");
251
  #endif
252
  }
253
 
 
149
  }
150
  };
151
 
152
+ // llama_context_data
153
+ struct llama_data_context {
154
+ virtual void write(const void * src, size_t size) = 0;
155
+ virtual size_t get_size_written() = 0;
156
+ virtual ~llama_data_context() = default;
157
+ };
158
+
159
+ struct llama_data_buffer_context : llama_data_context {
160
+ uint8_t* ptr;
161
+ size_t size_written = 0;
162
+
163
+ llama_data_buffer_context(uint8_t * p) : ptr(p) {}
164
+
165
+ void write(const void * src, size_t size) override {
166
+ memcpy(ptr, src, size);
167
+ ptr += size;
168
+ size_written += size;
169
+ }
170
+
171
+ size_t get_size_written() override {
172
+ return size_written;
173
+ }
174
+ };
175
+
176
+ struct llama_data_file_context : llama_data_context {
177
+ llama_file* file;
178
+ size_t size_written = 0;
179
+
180
+ llama_data_file_context(llama_file * f) : file(f) {}
181
+
182
+ void write(const void * src, size_t size) override {
183
+ file->write_raw(src, size);
184
+ size_written += size;
185
+ }
186
+
187
+ size_t get_size_written() override {
188
+ return size_written;
189
+ }
190
+ };
191
+
192
  #if defined(_WIN32)
193
  static std::string llama_format_win_err(DWORD err) {
194
  LPSTR buf;
 
219
  // prefetch/readahead impairs performance on NUMA systems
220
  if (numa) { prefetch = 0; }
221
  #ifdef __linux__
222
+ if (prefetch >= file->size) { flags |= MAP_POPULATE; }
223
  #endif
224
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
225
  if (addr == MAP_FAILED) {
 
287
  #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
288
  #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
289
  #else
290
+ printf("\nPrefetchVirtualMemory skipped in compatibility mode.\n");
291
  #endif
292
  }
293
 
llama.cpp CHANGED
@@ -57,8 +57,14 @@
57
  #pragma warning(disable: 4244 4267) // possible loss of data
58
  #endif
59
 
 
 
 
 
60
  #define LLAMA_USE_SCRATCH
61
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
 
62
 
63
  // available llama models
64
  enum e_model {
@@ -144,7 +150,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
144
  }
145
 
146
  // amount of VRAM needed per batch size to hold temporary results
147
- // the values for 3b and 65b are not derived from testing but instead chosen conservatively
148
  static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
149
  {
150
  static std::map<e_model, size_t> k_sizes = {
@@ -152,14 +158,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
152
  { MODEL_7B, 512ull * kB },
153
  { MODEL_13B, 640ull * kB },
154
  { MODEL_30B, 768ull * kB },
155
- { MODEL_65B, 1536ull * kB },
156
- { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
157
  };
158
  return k_sizes;
159
  }
160
 
161
  // amount of VRAM needed per batch size and context to hold temporary results
162
- // the values for 3b and 65b are not derived from testing but instead chosen conservatively
163
  static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
164
  {
165
  static std::map<e_model, size_t> k_sizes = {
@@ -167,8 +173,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
167
  { MODEL_7B, 128ull },
168
  { MODEL_13B, 160ull },
169
  { MODEL_30B, 208ull },
170
- { MODEL_65B, 416ull },
171
- { MODEL_70B, 416ull }, // TODO (likely can be reduced)
172
  };
173
  return k_sizes;
174
  }
@@ -328,13 +334,22 @@ struct llama_model {
328
 
329
  struct llama_context {
330
  llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
331
- #ifdef GGML_USE_METAL
332
  ~llama_context() {
 
 
 
 
333
  if (ctx_metal) {
334
  ggml_metal_free(ctx_metal);
335
  }
336
- }
337
 #endif
338
  std::mt19937 rng;
339
 
340
  bool has_evaluated_once = false;
@@ -372,7 +387,17 @@ struct llama_context {
372
  // memory buffers used to evaluate the model
373
  // TODO: move in llama_state
374
 llama_ctx_buffer buf_compute;
375
  llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
 
 
376
 
377
  #ifdef GGML_USE_METAL
378
  ggml_metal_context * ctx_metal = NULL;
@@ -382,9 +407,6 @@ struct llama_context {
382
  ggml_mpi_context * ctx_mpi = NULL;
383
  #endif
384
 
385
- int buf_last = 0;
386
- size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
387
-
388
  void use_buf(struct ggml_context * ctx, int i) {
389
  #if defined(LLAMA_USE_SCRATCH)
390
  size_t last_size = 0;
@@ -726,12 +748,12 @@ struct llama_model_loader {
726
 
727
  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
728
  size_t data_size = 0;
729
- size_t prefetch_size = 0;
730
  size_t lock_size = 0;
731
  for (const llama_load_tensor & lt : tensors_map.tensors) {
732
  data_size += lt.size;
733
- if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
734
- prefetch_size += lt.size;
735
  }
736
  }
737
 
@@ -880,6 +902,7 @@ struct llama_context_params llama_context_default_params() {
880
  /*.progress_callback =*/ nullptr,
881
  /*.progress_callback_user_data =*/ nullptr,
882
  /*.low_vram =*/ false,
 
883
  /*.f16_kv =*/ true,
884
  /*.logits_all =*/ false,
885
  /*.vocab_only =*/ false,
@@ -914,6 +937,11 @@ bool llama_mlock_supported() {
914
  return llama_mlock::SUPPORTED;
915
  }
916
917
  void llama_backend_init(bool numa) {
918
  ggml_time_init();
919
 
@@ -1007,6 +1035,7 @@ static void llama_model_load_internal(
1007
  int n_gpu_layers,
1008
  int main_gpu,
1009
  const float * tensor_split,
 
1010
  float rope_freq_base,
1011
  float rope_freq_scale,
1012
  bool low_vram,
@@ -1018,7 +1047,7 @@ static void llama_model_load_internal(
1018
  void * progress_callback_user_data) {
1019
 
1020
  model.t_start_us = ggml_time_us();
1021
- size_t blasbatchmul = (n_batch>512?2:1);
1022
 
1023
  std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
1024
 
@@ -1052,7 +1081,7 @@ static void llama_model_load_internal(
1052
  // LLaMAv2
1053
  // TODO: temporary until GGUF
1054
  //patch for llama2 gqa
1055
- if (model.type == e_model::MODEL_65B && hparams.n_mult >= 4096) {
1056
  fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__);
1057
  n_gqa = 8;
1058
  }
@@ -1141,9 +1170,11 @@ static void llama_model_load_internal(
1141
  }
1142
 
1143
  (void) main_gpu;
 
1144
  #if defined(GGML_USE_CUBLAS)
1145
  fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
1146
  ggml_cuda_set_main_device(main_gpu);
 
1147
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1148
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1149
  #elif defined(GGML_USE_CLBLAST)
@@ -1237,12 +1268,16 @@ static void llama_model_load_internal(
1237
  const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
1238
 
1239
  // this is the total memory required to run the inference
1240
- const size_t mem_required =
1241
  ctx_size +
1242
- mmapped_size - vram_weights + // weights in VRAM not in memory
 
 
 
1243
  blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
1244
  blasbatchmul*MEM_REQ_SCRATCH1().at(model.type) +
1245
  blasbatchmul*MEM_REQ_EVAL().at(model.type);
 
1246
 
1247
  // this is the memory required by one llama_state
1248
  const size_t mem_required_state =
@@ -1348,6 +1383,7 @@ static bool llama_model_load(
1348
  int n_gpu_layers,
1349
  int main_gpu,
1350
  const float * tensor_split,
 
1351
  float rope_freq_base,
1352
  float rope_freq_scale,
1353
  bool low_vram,
@@ -1358,7 +1394,8 @@ static bool llama_model_load(
1358
  llama_progress_callback progress_callback,
1359
  void *progress_callback_user_data) {
1360
  try {
1361
- llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
 
1362
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1363
  return true;
1364
  } catch (const std::exception & err) {
@@ -1367,32 +1404,15 @@ static bool llama_model_load(
1367
  }
1368
  }
1369
 
1370
- // evaluate the transformer
1371
- //
1372
- // - lctx: llama context
1373
- // - tokens: new batch of tokens to process
1374
- // - embd embeddings input
1375
- // - n_tokens number of tokens
1376
- // - n_past: the context size so far
1377
- // - n_threads: number of threads to use
1378
- //
1379
- static bool llama_eval_internal(
1380
  llama_context & lctx,
1381
  const llama_token * tokens,
1382
  const float * embd,
1383
  int n_tokens,
1384
- int n_past,
1385
- int n_threads,
1386
- const char * cgraph_fname) {
1387
 
1388
  LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
1389
 
1390
- #ifdef GGML_USE_MPI
1391
- ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
1392
- #endif
1393
-
1394
- const int64_t t_start_us = ggml_time_us();
1395
-
1396
  const int N = n_tokens;
1397
 
1398
  const auto & model = lctx.model;
@@ -1408,10 +1428,8 @@ static bool llama_eval_internal(
1408
  const int64_t n_head = hparams.n_head;
1409
  const int64_t n_head_kv = hparams.n_head_kv;
1410
  const int64_t n_embd_head = hparams.n_embd_head();
1411
- const int64_t n_vocab = hparams.n_vocab;
1412
  const int64_t n_embd_gqa = hparams.n_embd_gqa();
1413
 
1414
-
1415
  LLAMA_ASSERT(n_embd_head == hparams.n_rot);
1416
 
1417
  const float freq_base = hparams.rope_freq_base;
@@ -1423,26 +1441,35 @@ static bool llama_eval_internal(
1423
  auto & mem_per_token = lctx.mem_per_token;
1424
  auto & buf_compute = lctx.buf_compute;
1425
 
 
1426
  struct ggml_init_params params = {
1427
  /*.mem_size =*/ buf_compute.size,
1428
  /*.mem_buffer =*/ buf_compute.addr,
1429
  /*.no_alloc =*/ false,
1430
  };
1431
 
1432
- struct ggml_context * ctx0 = ggml_init(params);
 
 
1433
 
1434
- ggml_cgraph gf = {};
1435
 
1436
- // for big prompts, if BLAS is enabled, it is better to use only one thread
1437
- // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
1438
- n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1439
 
1440
  struct ggml_tensor * cur;
1441
  struct ggml_tensor * inpL;
1442
 
1443
  if (tokens) {
1444
  struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
 
 
 
 
 
 
 
1445
  memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
 
1446
  ggml_set_name(inp_tokens, "inp_tokens");
1447
 
1448
  inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@@ -1452,7 +1479,15 @@ static bool llama_eval_internal(
1452
  #endif
1453
 
1454
  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
 
 
 
 
 
 
 
1455
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
 
1456
  }
1457
 
1458
  const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1479,6 +1514,17 @@ static bool llama_eval_internal(
1479
  }
1480
  #endif // GGML_USE_CUBLAS
1481
 
 
 
 
 
 
 
 
 
 
 
 
1482
  for (int il = 0; il < n_layer; ++il) {
1483
  ggml_format_name(inpL, "layer_inp_%d", il);
1484
 
@@ -1548,8 +1594,8 @@ static bool llama_eval_internal(
1548
  ggml_set_name(v, "v");
1549
 
1550
  // important: storing RoPE-ed version of K in the KV cache!
1551
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
1552
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
1553
  }
1554
 
1555
  struct ggml_tensor * Q =
@@ -1574,9 +1620,6 @@ static bool llama_eval_internal(
1574
  ggml_set_name(KQ, "KQ");
1575
 
1576
  // KQ_scaled = KQ / sqrt(n_embd_head)
1577
- struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
1578
- ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
1579
-
1580
  // KQ_scaled shape [n_past + N, N, n_head, 1]
1581
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
1582
  offload_func_kq(KQ_scaled);
@@ -1692,9 +1735,6 @@ static bool llama_eval_internal(
1692
 
1693
  lctx.use_buf(ctx0, 0);
1694
 
1695
- // used at the end to optionally extract the embeddings
1696
- struct ggml_tensor * embeddings = NULL;
1697
-
1698
  // norm
1699
  {
1700
  cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
@@ -1705,8 +1745,6 @@ static bool llama_eval_internal(
1705
  cur = ggml_mul(ctx0, cur, model.norm);
1706
  // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
1707
  ggml_set_name(cur, "result_norm");
1708
-
1709
- embeddings = cur;
1710
  }
1711
 
1712
  // lm_head
@@ -1718,23 +1756,103 @@ static bool llama_eval_internal(
1718
  // logits -> probs
1719
  //cur = ggml_soft_max_inplace(ctx0, cur);
1720
 
1721
- // run the computation
1722
- ggml_build_forward_expand(&gf, cur);
1723
 
1724
- // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1725
 
1726
  #if GGML_USE_MPI
1727
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
 
1728
  #endif
1729
 
1730
  #ifdef GGML_USE_METAL
1731
  if (lctx.ctx_metal && N == 1) {
1732
- if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
1733
- ggml_metal_graph_find_concurrency(lctx.ctx_metal,&gf);
1734
- }
 
1735
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
1736
- ggml_metal_graph_compute(lctx.ctx_metal, &gf);
1737
- ggml_metal_get_tensor (lctx.ctx_metal, cur);
 
 
 
1738
  } else {
1739
  // IMPORTANT:
1740
  // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1752,34 +1870,32 @@ static bool llama_eval_internal(
1752
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
1753
  }
1754
 
1755
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
1756
  }
1757
  #else
1758
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
1759
  #endif
1760
 
1761
  #if GGML_USE_MPI
1762
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
1763
  #endif
1764
 
1765
  // update kv token count
1766
  lctx.kv_self.n = n_past + N;
1767
 
1768
- struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
1769
-
1770
  if (cgraph_fname) {
1771
- ggml_graph_export(&gf, cgraph_fname);
1772
  }
1773
 
1774
  #ifdef GGML_PERF
1775
  // print timing information per ggml operation (for debugging purposes)
1776
  // requires GGML_PERF to be defined
1777
- ggml_graph_print(&gf);
1778
  #endif
1779
 
1780
  // plot the computation graph in dot format (for debugging purposes)
1781
  //if (n_past%100 == 0) {
1782
- // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
1783
  //}
1784
 
1785
  // extract logits
@@ -1804,21 +1920,6 @@ static bool llama_eval_internal(
1804
  memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
1805
  }
1806
 
1807
- if (mem_per_token == 0) {
1808
- mem_per_token = ggml_used_mem(ctx0)/N;
1809
- }
1810
-
1811
- #if 0
1812
- printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
1813
- ggml_used_mem(ctx0)/1024.0/1024.0,
1814
- lctx.get_buf_max_mem(0)/1024.0/1024.0,
1815
- lctx.get_buf_max_mem(1)/1024.0/1024.0,
1816
- lctx.work_buffer.size()/1024.0/1024.0,
1817
- n_past, N);
1818
- #endif
1819
-
1820
- ggml_free(ctx0);
1821
-
1822
  // measure the performance only for the single-token evals
1823
  if (N == 1) {
1824
  lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1930,7 +2031,9 @@ struct llama_tokenizer {
1930
  if (token == vocab_.token_to_id.end()) {
1931
  // output any symbols that did not form tokens as bytes.
1932
  for (int j = 0; j < (int) symbol.n; ++j) {
1933
- llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
 
 
1934
  output.push_back(token_id);
1935
  }
1936
  } else {
@@ -3107,7 +3210,7 @@ struct llama_model * llama_load_model_from_file(
3107
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
3108
 
3109
  if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
3110
- params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
3111
  memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
3112
  params.progress_callback_user_data)) {
3113
  delete model;
@@ -3136,7 +3239,7 @@ struct llama_context * llama_new_context_with_model(
3136
  params.seed = time(NULL);
3137
  }
3138
 
3139
- size_t blasbatchmul = (params.n_batch>512?2:1);
3140
 
3141
  unsigned cur_percentage = 0;
3142
  if (params.progress_callback == NULL) {
@@ -3186,10 +3289,47 @@ struct llama_context * llama_new_context_with_model(
3186
  ctx->embedding.resize(hparams.n_embd);
3187
  }
3188
 
3189
- ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL().at(ctx->model.type));
 
3191
  ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
3192
  ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1().at(ctx->model.type));
 
3193
  }
3194
 
3195
  #ifdef GGML_USE_METAL
@@ -3259,9 +3399,6 @@ struct llama_context * llama_init_from_file(
3259
  }
3260
 
3261
  void llama_free(struct llama_context * ctx) {
3262
- if (ctx->model_owner) {
3263
- delete &ctx->model;
3264
- }
3265
  delete ctx;
3266
  }
3267
 
@@ -3620,10 +3757,20 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
3620
  return s_total;
3621
  }
3622
 
3623
- // Copies the state to the specified destination address
3624
- size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3625
- uint8_t * out = dst;
3626
-
 
3627
  // copy rng
3628
  {
3629
  std::stringstream rng_ss;
@@ -3635,8 +3782,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3635
  memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
3636
  memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
3637
 
3638
- memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
3639
- memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
3640
  }
3641
 
3642
  // copy logits
@@ -3644,25 +3791,29 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3644
  const size_t logits_cap = ctx->logits.capacity();
3645
  const size_t logits_size = ctx->logits.size();
3646
 
3647
- memcpy(out, &logits_cap, sizeof(logits_cap)); out += sizeof(logits_cap);
3648
- memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
3649
 
3650
  if (logits_size) {
3651
- memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
3652
  }
3653
 
3654
- out += logits_cap * sizeof(float);
 
3655
  }
3656
 
3657
  // copy embeddings
3658
  {
3659
  const size_t embedding_size = ctx->embedding.size();
3660
 
3661
- memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
3662
 
3663
  if (embedding_size) {
3664
- memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
3665
- out += embedding_size * sizeof(float);
3666
  }
3667
  }
3668
 
@@ -3671,14 +3822,14 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3671
  const auto & kv_self = ctx->kv_self;
3672
  const auto & hparams = ctx->model.hparams;
3673
  const int n_layer = hparams.n_layer;
3674
- const int n_embd = hparams.n_embd;
3675
  const int n_ctx = hparams.n_ctx;
3676
 
3677
  const size_t kv_size = kv_self.buf.size;
3678
  const int kv_ntok = llama_get_kv_cache_token_count(ctx);
3679
 
3680
- memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
3681
- memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
3682
 
3683
  if (kv_size) {
3684
  const size_t elt_size = ggml_element_size(kv_self.k);
@@ -3687,12 +3838,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3687
  ggml_cgraph gf{};
3688
 
3689
  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
3690
- kout3d->data = out;
3691
- out += ggml_nbytes(kout3d);
3692
 
3693
  ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
3694
- vout3d->data = out;
3695
- out += ggml_nbytes(vout3d);
3696
 
3697
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
3698
  n_embd, kv_ntok, n_layer,
@@ -3707,15 +3858,20 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3707
  ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
3708
 
3709
  ggml_free(cpy_ctx);
 
3710
  }
3711
  }
 
3712
 
3713
- const size_t written = out - dst;
3714
- const size_t max_size = llama_get_state_size(ctx);
3715
-
3716
- LLAMA_ASSERT(written <= max_size);
3717
 
3718
- return written;
3719
  }
3720
 
3721
  // Sets the state reading from the specified source address
@@ -3774,7 +3930,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
3774
  const auto & kv_self = ctx->kv_self;
3775
  const auto & hparams = ctx->model.hparams;
3776
  const int n_layer = hparams.n_layer;
3777
- const int n_embd = hparams.n_embd;
3778
  const int n_ctx = hparams.n_ctx;
3779
 
3780
  size_t kv_size;
@@ -3900,15 +4056,9 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
3900
  file.write_u32((uint32_t) n_token_count);
3901
  file.write_raw(tokens, sizeof(llama_token) * n_token_count);
3902
 
3903
- // save the context state
3904
- {
3905
- const size_t n_state_size_max = llama_get_state_size(ctx);
3906
-
3907
- std::vector<uint8_t> state_data(n_state_size_max);
3908
- const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
3909
-
3910
- file.write_raw(state_data.data(), n_state_size_cur);
3911
- }
3912
 
3913
  return true;
3914
  }
 
57
  #pragma warning(disable: 4244 4267) // possible loss of data
58
  #endif
59
 
60
+ #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
61
+ #include "ggml-alloc.h"
62
+ #define LLAMA_USE_ALLOCATOR
63
+ #else
64
  #define LLAMA_USE_SCRATCH
65
  #define LLAMA_MAX_SCRATCH_BUFFERS 16
66
+ #endif
67
+
68
 
69
  // available llama models
70
  enum e_model {
 
150
  }
151
 
152
  // amount of VRAM needed per batch size to hold temporary results
153
+ // the values for 3b are not derived from testing but instead chosen conservatively
154
  static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
155
  {
156
  static std::map<e_model, size_t> k_sizes = {
 
158
  { MODEL_7B, 512ull * kB },
159
  { MODEL_13B, 640ull * kB },
160
  { MODEL_30B, 768ull * kB },
161
+ { MODEL_65B, 1360ull * kB },
162
+ { MODEL_70B, 1360ull * kB },
163
  };
164
  return k_sizes;
165
  }
166
 
167
  // amount of VRAM needed per batch size and context to hold temporary results
168
+ // the values for 3b are not derived from testing but instead chosen conservatively
169
  static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
170
  {
171
  static std::map<e_model, size_t> k_sizes = {
 
173
  { MODEL_7B, 128ull },
174
  { MODEL_13B, 160ull },
175
  { MODEL_30B, 208ull },
176
+ { MODEL_65B, 320ull },
177
+ { MODEL_70B, 320ull },
178
  };
179
  return k_sizes;
180
  }
 
334
 
335
  struct llama_context {
336
  llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
 
337
  ~llama_context() {
338
+ if (model_owner) {
339
+ delete &model;
340
+ }
341
+ #ifdef GGML_USE_METAL
342
  if (ctx_metal) {
343
  ggml_metal_free(ctx_metal);
344
  }
 
345
  #endif
346
+ #ifdef LLAMA_USE_ALLOCATOR
347
+ if (alloc) {
348
+ ggml_allocr_free(alloc);
349
+ }
350
+ #endif
351
+ }
352
+
353
  std::mt19937 rng;
354
 
355
  bool has_evaluated_once = false;
 
387
  // memory buffers used to evaluate the model
388
  // TODO: move in llama_state
389
  llama_ctx_buffer buf_compute;
390
+
391
+ #ifdef LLAMA_USE_ALLOCATOR
392
+ llama_ctx_buffer buf_alloc;
393
+ ggml_allocr * alloc = NULL;
394
+ #endif
395
+
396
+ #ifdef LLAMA_USE_SCRATCH
397
  llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
398
+ int buf_last = 0;
399
+ size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
400
+ #endif
401
 
402
  #ifdef GGML_USE_METAL
403
  ggml_metal_context * ctx_metal = NULL;
 
407
  ggml_mpi_context * ctx_mpi = NULL;
408
  #endif
409
 
 
 
 
410
  void use_buf(struct ggml_context * ctx, int i) {
411
  #if defined(LLAMA_USE_SCRATCH)
412
  size_t last_size = 0;
 
748
 
749
  void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
750
  size_t data_size = 0;
751
+ size_t prefetch_size = file_loader->file.size;
752
  size_t lock_size = 0;
753
  for (const llama_load_tensor & lt : tensors_map.tensors) {
754
  data_size += lt.size;
755
+ if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
756
+ prefetch_size -= lt.size;
757
  }
758
  }
759
 
 
902
  /*.progress_callback =*/ nullptr,
903
  /*.progress_callback_user_data =*/ nullptr,
904
  /*.low_vram =*/ false,
905
+ /*.mul_mat_q =*/ false,
906
  /*.f16_kv =*/ true,
907
  /*.logits_all =*/ false,
908
  /*.vocab_only =*/ false,
 
937
  return llama_mlock::SUPPORTED;
938
  }
939
 
940
+ int get_blas_batch_mul(int batch)
941
+ {
942
+ return (batch>512?(batch>1024?4:2):1);
943
+ }
944
+
945
  void llama_backend_init(bool numa) {
946
  ggml_time_init();
947
 
 
1035
  int n_gpu_layers,
1036
  int main_gpu,
1037
  const float * tensor_split,
1038
+ const bool mul_mat_q,
1039
  float rope_freq_base,
1040
  float rope_freq_scale,
1041
  bool low_vram,
 
1047
  void * progress_callback_user_data) {
1048
 
1049
  model.t_start_us = ggml_time_us();
1050
+ size_t blasbatchmul = get_blas_batch_mul(n_batch);
1051
 
1052
  std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
1053
 
 
1081
  // LLaMAv2
1082
  // TODO: temporary until GGUF
1083
  //patch for llama2 gqa
1084
+ if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
1085
  fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__);
1086
  n_gqa = 8;
1087
  }
 
1170
  }
1171
 
1172
  (void) main_gpu;
1173
+ (void) mul_mat_q;
1174
  #if defined(GGML_USE_CUBLAS)
1175
  fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
1176
  ggml_cuda_set_main_device(main_gpu);
1177
+ ggml_cuda_set_mul_mat_q(mul_mat_q);
1178
  #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
1179
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
1180
  #elif defined(GGML_USE_CLBLAST)
 
1268
  const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
1269
 
1270
  // this is the total memory required to run the inference
1271
+ size_t mem_required =
1272
  ctx_size +
1273
+ mmapped_size - vram_weights; // weights in VRAM not in memory
1274
+
1275
+ #ifndef LLAMA_USE_ALLOCATOR
1276
+ mem_required +=
1277
  blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
1278
  blasbatchmul*MEM_REQ_SCRATCH1().at(model.type) +
1279
  blasbatchmul*MEM_REQ_EVAL().at(model.type);
1280
+ #endif
1281
 
1282
  // this is the memory required by one llama_state
1283
  const size_t mem_required_state =
 
1383
  int n_gpu_layers,
1384
  int main_gpu,
1385
  const float * tensor_split,
1386
+ const bool mul_mat_q,
1387
  float rope_freq_base,
1388
  float rope_freq_scale,
1389
  bool low_vram,
 
1394
  llama_progress_callback progress_callback,
1395
  void *progress_callback_user_data) {
1396
  try {
1397
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
1398
+ main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
1399
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1400
  return true;
1401
  } catch (const std::exception & err) {
 
1404
  }
1405
  }
1406
 
1407
+ static struct ggml_cgraph * llama_build_graph(
 
1408
  llama_context & lctx,
1409
  const llama_token * tokens,
1410
  const float * embd,
1411
  int n_tokens,
1412
+ int n_past) {
 
 
1413
 
1414
  LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
1415
 
 
 
 
 
 
 
1416
  const int N = n_tokens;
1417
 
1418
  const auto & model = lctx.model;
 
1428
  const int64_t n_head = hparams.n_head;
1429
  const int64_t n_head_kv = hparams.n_head_kv;
1430
  const int64_t n_embd_head = hparams.n_embd_head();
 
1431
  const int64_t n_embd_gqa = hparams.n_embd_gqa();
1432
 
 
1433
  LLAMA_ASSERT(n_embd_head == hparams.n_rot);
1434
 
1435
  const float freq_base = hparams.rope_freq_base;
 
1441
  auto & mem_per_token = lctx.mem_per_token;
1442
  auto & buf_compute = lctx.buf_compute;
1443
 
1444
+
1445
  struct ggml_init_params params = {
1446
  /*.mem_size =*/ buf_compute.size,
1447
  /*.mem_buffer =*/ buf_compute.addr,
1448
  /*.no_alloc =*/ false,
1449
  };
1450
 
1451
+ #ifdef LLAMA_USE_ALLOCATOR
1452
+ params.no_alloc = true;
1453
+ #endif
1454
 
1455
+ struct ggml_context * ctx0 = ggml_init(params);
1456
 
1457
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
 
 
1458
 
1459
  struct ggml_tensor * cur;
1460
  struct ggml_tensor * inpL;
1461
 
1462
  if (tokens) {
1463
  struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1464
+
1465
+ #ifdef LLAMA_USE_ALLOCATOR
1466
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
1467
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
1468
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
1469
+ }
1470
+ #else
1471
  memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
1472
+ #endif
1473
  ggml_set_name(inp_tokens, "inp_tokens");
1474
 
1475
  inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
 
1479
  #endif
1480
 
1481
  inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
1482
+
1483
+ #ifdef LLAMA_USE_ALLOCATOR
1484
+ ggml_allocr_alloc(lctx.alloc, inpL);
1485
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
1486
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
1487
+ }
1488
+ #else
1489
  memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
1490
+ #endif
1491
  }
1492
 
1493
  const int i_gpu_start = n_layer - n_gpu_layers;
 
1514
  }
1515
  #endif // GGML_USE_CUBLAS
1516
 
1517
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
1518
+ #ifdef LLAMA_USE_ALLOCATOR
1519
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
1520
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
1521
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
1522
+ }
1523
+ #else
1524
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
1525
+ #endif
1526
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
1527
+
1528
  for (int il = 0; il < n_layer; ++il) {
1529
  ggml_format_name(inpL, "layer_inp_%d", il);
1530
 
 
1594
  ggml_set_name(v, "v");
1595
 
1596
  // important: storing RoPE-ed version of K in the KV cache!
1597
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
1598
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
1599
  }
1600
 
1601
  struct ggml_tensor * Q =
 
1620
  ggml_set_name(KQ, "KQ");
1621
 
1622
  // KQ_scaled = KQ / sqrt(n_embd_head)
 
 
 
1623
  // KQ_scaled shape [n_past + N, N, n_head, 1]
1624
  struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
1625
  offload_func_kq(KQ_scaled);
 
1735
 
1736
  lctx.use_buf(ctx0, 0);
1737
 
 
 
 
1738
  // norm
1739
  {
1740
  cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
 
1745
  cur = ggml_mul(ctx0, cur, model.norm);
1746
  // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
1747
  ggml_set_name(cur, "result_norm");
 
 
1748
  }
1749
 
1750
  // lm_head
 
1756
  // logits -> probs
1757
  //cur = ggml_soft_max_inplace(ctx0, cur);
1758
 
1759
+ ggml_build_forward_expand(gf, cur);
 
1760
 
1761
+ if (mem_per_token == 0) {
1762
+ mem_per_token = ggml_used_mem(ctx0)/N;
1763
+ }
1764
+
1765
+ #if 0
1766
+ printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
1767
+ ggml_used_mem(ctx0)/1024.0/1024.0,
1768
+ lctx.get_buf_max_mem(0)/1024.0/1024.0,
1769
+ lctx.get_buf_max_mem(1)/1024.0/1024.0,
1770
+ lctx.work_buffer.size()/1024.0/1024.0,
1771
+ n_past, N);
1772
+ #endif
1773
+
1774
+ ggml_free(ctx0);
1775
+
1776
+ return gf;
1777
+ }
1778
+
1779
+ // evaluate the transformer
1780
+ //
1781
+ // - lctx: llama context
1782
+ // - tokens: new batch of tokens to process
1783
+ // - embd embeddings input
1784
+ // - n_tokens number of tokens
1785
+ // - n_past: the context size so far
1786
+ // - n_threads: number of threads to use
1787
+ //
1788
+ static bool llama_eval_internal(
1789
+ llama_context & lctx,
1790
+ const llama_token * tokens,
1791
+ const float * embd,
1792
+ int n_tokens,
1793
+ int n_past,
1794
+ int n_threads,
1795
+ const char * cgraph_fname) {
1796
+
1797
+ LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
1798
+
1799
+ const int64_t t_start_us = ggml_time_us();
1800
+
1801
+ #ifdef GGML_USE_MPI
1802
+ ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
1803
+ #endif
1804
+
1805
+ const int N = n_tokens;
1806
+
1807
+ const auto & model = lctx.model;
1808
+ const auto & hparams = model.hparams;
1809
+
1810
+ const auto & kv_self = lctx.kv_self;
1811
+
1812
+ LLAMA_ASSERT(!!kv_self.ctx);
1813
+
1814
+ const int64_t n_embd = hparams.n_embd;
1815
+ const int64_t n_vocab = hparams.n_vocab;
1816
+
1817
+ #ifdef LLAMA_USE_ALLOCATOR
1818
+ ggml_allocr_reset(lctx.alloc);
1819
+ #endif
1820
+
1821
+ ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
1822
+
1823
+ #ifdef LLAMA_USE_ALLOCATOR
1824
+ ggml_allocr_alloc_graph(lctx.alloc, gf);
1825
+ #endif
1826
+
1827
+ // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
1828
+
1829
+ // for big prompts, if BLAS is enabled, it is better to use only one thread
1830
+ // otherwise, the threads spin-lock waiting for the BLAS calls and degrade performance
1831
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1832
+
1833
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
1834
+ struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
1835
+
1836
+ LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
1837
+ LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
1838
 
1839
  #if GGML_USE_MPI
1840
+ const int64_t n_layer = hparams.n_layer;
1841
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
1842
  #endif
1843
 
1844
  #ifdef GGML_USE_METAL
1845
  if (lctx.ctx_metal && N == 1) {
1846
+ // TODO: disabled until #2413 is resolved
1847
+ //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
1848
+ // ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
1849
+ //}
1850
  ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
1851
+ ggml_metal_graph_compute(lctx.ctx_metal, gf);
1852
+ ggml_metal_get_tensor (lctx.ctx_metal, res);
1853
+ if (!lctx.embedding.empty()) {
1854
+ ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
1855
+ }
1856
  } else {
1857
  // IMPORTANT:
1858
  // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
 
1870
  ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
1871
  }
1872
 
1873
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
1874
  }
1875
  #else
1876
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
1877
  #endif
1878
 
1879
  #if GGML_USE_MPI
1880
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
1881
  #endif
1882
 
1883
  // update kv token count
1884
  lctx.kv_self.n = n_past + N;
1885
 
 
 
1886
  if (cgraph_fname) {
1887
+ ggml_graph_export(gf, cgraph_fname);
1888
  }
1889
 
1890
  #ifdef GGML_PERF
1891
  // print timing information per ggml operation (for debugging purposes)
1892
  // requires GGML_PERF to be defined
1893
+ ggml_graph_print(gf);
1894
  #endif
1895
 
1896
  // plot the computation graph in dot format (for debugging purposes)
1897
  //if (n_past%100 == 0) {
1898
+ // ggml_graph_dump_dot(gf, NULL, "llama.dot");
1899
  //}
1900
 
1901
  // extract logits
 
1920
  memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
1921
  }
1922
 
1923
  // measure the performance only for the single-token evals
1924
  if (N == 1) {
1925
  lctx.t_eval_us += ggml_time_us() - t_start_us;
 
2031
  if (token == vocab_.token_to_id.end()) {
2032
  // output any symbols that did not form tokens as bytes.
2033
  for (int j = 0; j < (int) symbol.n; ++j) {
2034
+ // NOTE: old version, before #2420 - not sure what the implications of this are
2035
+ //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
2036
+ llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
2037
  output.push_back(token_id);
2038
  }
2039
  } else {
 
3210
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
3211
 
3212
  if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
3213
+ params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
3214
  memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
3215
  params.progress_callback_user_data)) {
3216
  delete model;
 
3239
  params.seed = time(NULL);
3240
  }
3241
 
3242
+ size_t blasbatchmul = get_blas_batch_mul(params.n_batch);
3243
 
3244
  unsigned cur_percentage = 0;
3245
  if (params.progress_callback == NULL) {
 
3289
  ctx->embedding.resize(hparams.n_embd);
3290
  }
3291
 
3292
+ #ifdef LLAMA_USE_ALLOCATOR
3293
+ {
3294
+ static const size_t tensor_alignment = 32;
3295
+ // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
3296
+ ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
3297
+
3298
+ // create measure allocator
3299
+ ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
3300
+
3301
+ // build worst-case graph
3302
+ int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
3303
+ int n_past = hparams.n_ctx - n_tokens;
3304
+ llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
3305
+ ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
3306
+
3307
+ // measure memory requirements for the graph
3308
+ size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
3309
+
3310
+ fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
3311
 
3312
+ // debug - for comparison with scratch buffer
3313
+ //size_t prev_req =
3314
+ // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
3315
+ // MEM_REQ_SCRATCH1().at(ctx->model.type) +
3316
+ // MEM_REQ_EVAL().at(ctx->model.type);
3317
+ //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
3318
+
3319
+ // recreate allocator with exact memory requirements
3320
+ ggml_allocr_free(ctx->alloc);
3321
+
3322
+ ctx->buf_alloc.resize(alloc_size);
3323
+ ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
3324
+ }
3325
+ #else
3326
+ ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
3327
+ #endif
3328
+
3329
+ #ifdef LLAMA_USE_SCRATCH
3330
  ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
3331
  ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1().at(ctx->model.type));
3332
+ #endif
3333
  }
3334
 
3335
  #ifdef GGML_USE_METAL
 
3399
  }
3400
 
3401
  void llama_free(struct llama_context * ctx) {
 
3402
  delete ctx;
3403
  }
3404
 
 
3757
  return s_total;
3758
  }
3759
 
3760
+ /** copy state data into either a buffer or a file, depending on the passed-in context
3761
+ *
3762
+ * file context:
3763
+ * llama_file file("/path", "wb");
3764
+ * llama_data_file_context data_ctx(&file);
3765
+ * llama_copy_state_data(ctx, &data_ctx);
3766
+ *
3767
+ * buffer context:
3768
+ * std::vector<uint8_t> buf(max_size, 0);
3769
+ * llama_data_buffer_context data_ctx(buf.data());
3770
+ * llama_copy_state_data(ctx, &data_ctx);
3771
+ *
3772
+ */
3773
+ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
3774
  // copy rng
3775
  {
3776
  std::stringstream rng_ss;
 
3782
  memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
3783
  memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
3784
 
3785
+ data_ctx->write(&rng_size, sizeof(rng_size));
3786
+ data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
3787
  }
3788
 
3789
  // copy logits
 
3791
  const size_t logits_cap = ctx->logits.capacity();
3792
  const size_t logits_size = ctx->logits.size();
3793
 
3794
+ data_ctx->write(&logits_cap, sizeof(logits_cap));
3795
+ data_ctx->write(&logits_size, sizeof(logits_size));
3796
 
3797
  if (logits_size) {
3798
+ data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
3799
  }
3800
 
3801
+ // If there is a gap between the size and the capacity, write padding
3802
+ size_t padding_size = (logits_cap - logits_size) * sizeof(float);
3803
+ if (padding_size > 0) {
3804
+ std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
3805
+ data_ctx->write(padding.data(), padding_size);
3806
+ }
3807
  }
3808
 
3809
  // copy embeddings
3810
  {
3811
  const size_t embedding_size = ctx->embedding.size();
3812
 
3813
+ data_ctx->write(&embedding_size, sizeof(embedding_size));
3814
 
3815
  if (embedding_size) {
3816
+ data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
 
3817
  }
3818
  }
3819
 
 
3822
  const auto & kv_self = ctx->kv_self;
3823
  const auto & hparams = ctx->model.hparams;
3824
  const int n_layer = hparams.n_layer;
3825
+ const int n_embd = hparams.n_embd_gqa();
3826
  const int n_ctx = hparams.n_ctx;
3827
 
3828
  const size_t kv_size = kv_self.buf.size;
3829
  const int kv_ntok = llama_get_kv_cache_token_count(ctx);
3830
 
3831
+ data_ctx->write(&kv_size, sizeof(kv_size));
3832
+ data_ctx->write(&kv_ntok, sizeof(kv_ntok));
3833
 
3834
  if (kv_size) {
3835
  const size_t elt_size = ggml_element_size(kv_self.k);
 
3838
  ggml_cgraph gf{};
3839
 
3840
  ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
3841
+ std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
3842
+ kout3d->data = kout3d_data.data();
3843
 
3844
  ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
3845
+ std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
3846
+ vout3d->data = vout3d_data.data();
3847
 
3848
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
3849
  n_embd, kv_ntok, n_layer,
 
3858
  ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
3859
 
3860
  ggml_free(cpy_ctx);
3861
+
3862
+ // our data is now in the kout3d_data and vout3d_data buffers
3863
+ // write them to file
3864
+ data_ctx->write(kout3d_data.data(), kout3d_data.size());
3865
+ data_ctx->write(vout3d_data.data(), vout3d_data.size());
3866
  }
3867
  }
3868
+ }
3869
 
3870
+ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3871
+ llama_data_buffer_context data_ctx(dst);
3872
+ llama_copy_state_data_internal(ctx, &data_ctx);
 
3873
 
3874
+ return data_ctx.get_size_written();
3875
  }
3876
 
3877
  // Sets the state reading from the specified source address
 
3930
  const auto & kv_self = ctx->kv_self;
3931
  const auto & hparams = ctx->model.hparams;
3932
  const int n_layer = hparams.n_layer;
3933
+ const int n_embd = hparams.n_embd_gqa();
3934
  const int n_ctx = hparams.n_ctx;
3935
 
3936
  size_t kv_size;
 
4056
  file.write_u32((uint32_t) n_token_count);
4057
  file.write_raw(tokens, sizeof(llama_token) * n_token_count);
4058
 
4059
+ // save the context state using stream saving
4060
+ llama_data_file_context data_ctx(&file);
4061
+ llama_copy_state_data_internal(ctx, &data_ctx);
 
4062
 
4063
  return true;
4064
  }
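
The LLAMA_USE_ALLOCATOR hunks above replace the fixed MEM_REQ_EVAL scratch sizing with a two-pass scheme: a measure allocator first walks a worst-case graph to record how much tensor data an eval will ever need, then a real allocator backed by a buffer of exactly that size is created and reused (reset + alloc_graph) on every call. A minimal sketch of that flow, using only the ggml-alloc calls that appear in the diff; build_graph() below is a hypothetical stand-in for llama_build_graph(), and the tensor sizes are placeholders:

#include <vector>
#include "ggml.h"
#include "ggml-alloc.h"

// hypothetical stand-in for llama_build_graph(): the same graph is built in both passes
static ggml_cgraph * build_graph(ggml_context * ctx, ggml_allocr * alloc) {
    ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
    ggml_allocr_alloc(alloc, inp);               // reserve (or just measure) space for the input
    if (!ggml_allocr_is_measure(alloc)) {
        // only the real pass may write into inp->data
    }
    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, ggml_mul(ctx, inp, inp));
    return gf;
}

int main() {
    static const size_t tensor_alignment = 32;

    // the compute buffer only holds tensor/graph structs; tensor data goes through the allocator
    std::vector<uint8_t> buf_compute(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
    ggml_init_params params = { buf_compute.size(), buf_compute.data(), /*no_alloc =*/ true };

    // pass 1: measure the worst-case graph without allocating any tensor data
    ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
    ggml_context * ctx  = ggml_init(params);
    ggml_cgraph  * gf   = build_graph(ctx, alloc);
    const size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment;
    ggml_free(ctx);
    ggml_allocr_free(alloc);

    // pass 2: back the allocator with a buffer of exactly the measured size
    std::vector<uint8_t> buf_alloc(alloc_size);
    alloc = ggml_allocr_new(buf_alloc.data(), buf_alloc.size(), tensor_alignment);

    // per evaluation: reset, rebuild the graph, allocate its tensors, then compute
    ggml_allocr_reset(alloc);
    ctx = ggml_init(params);
    gf  = build_graph(ctx, alloc);
    ggml_allocr_alloc_graph(alloc, gf);
    // ... run the graph with the usual compute helper ...
    ggml_free(ctx);

    ggml_allocr_free(alloc);
    return 0;
}

In measure mode the tensor data pointers are not backed by real memory, which is why every memcpy into an input tensor in the diff is guarded by !ggml_allocr_is_measure(lctx.alloc).
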
llama.h CHANGED
@@ -108,6 +108,7 @@ extern "C" {
108
 
109
  // Keep the booleans together to avoid misalignment during copy-by-value.
110
  bool low_vram; // if true, reduce VRAM usage at the cost of performance
 
111
  bool f16_kv; // use fp16 for KV cache
112
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
113
  bool vocab_only; // only load the vocabulary, no weights
 
108
 
109
  // Keep the booleans together to avoid misalignment during copy-by-value.
110
  bool low_vram; // if true, reduce VRAM usage at the cost of performance
111
+ bool mul_mat_q; // if true, use experimental mul_mat_q kernels
112
  bool f16_kv; // use fp16 for KV cache
113
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
114
  bool vocab_only; // only load the vocabulary, no weights
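
llama.h gains a mul_mat_q context parameter alongside low_vram; it is forwarded through llama_model_load into ggml_cuda_set_mul_mat_q() and simply ignored ((void) mul_mat_q) by non-CUDA builds. A minimal sketch of opting in from the public API, assuming a CUDA build; "model.bin" is a placeholder path:

#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init(/*numa =*/ false);

    llama_context_params params = llama_context_default_params();
    params.n_gpu_layers = 32;    // offload layers as usual
    params.mul_mat_q    = true;  // new flag: use the experimental mul_mat_q kernels (defaults to false)

    llama_model * model = llama_load_model_from_file("model.bin", params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }
    llama_context * ctx = llama_new_context_with_model(model, params);

    // ... tokenize and evaluate as before ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
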
make_old_pyinstaller.bat CHANGED
@@ -1,4 +1,4 @@
1
  echo This file is only for my own usage, please do not use it. I am lazy.
2
 
3
  set PATH=d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python;d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python\\Scripts;%PATH%
4
- PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp_nocuda.exe"
 
1
  echo This file is only for my own usage, please do not use it. I am lazy.
2
 
3
  set PATH=d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python;d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python\\Scripts;%PATH%
4
+ PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp_nocuda.exe"
make_old_pyinstaller_cuda.bat CHANGED
@@ -1,4 +1,4 @@
1
  echo This file is only for my own usage, please do not use it. I am lazy.
2
 
3
  set PATH=d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python;d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python\\Scripts;%PATH%
4
- PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./nikogreen.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cublas64_11.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cudart64_110.dll;." --add-data "./msvcp140.dll;." --add-data "./vcruntime140.dll;." --add-data "./vcruntime140_1.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe"
 
1
  echo This file is only for my own usage, please do not use it. I am lazy.
2
 
3
  set PATH=d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python;d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python\\Scripts;%PATH%
4
+ PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./nikogreen.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cublas64_11.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cudart64_110.dll;." --add-data "./msvcp140.dll;." --add-data "./vcruntime140.dll;." --add-data "./vcruntime140_1.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe"
make_pyinstaller.bat CHANGED
@@ -1 +1 @@
1
- PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe"
 
1
+ PyInstaller --noconfirm --onefile --clean --console --collect-all customtkinter --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe"
make_pyinstaller.sh CHANGED
@@ -5,7 +5,7 @@ pyinstaller --noconfirm --onefile --clean --console --collect-all customtkinter
5
  --add-data "./koboldcpp.so:." \
6
  --add-data "./koboldcpp_openblas.so:." \
7
  --add-data "./koboldcpp_failsafe.so:." \
8
- --add-data "./koboldcpp_openblas_noavx2.so:." \
9
  --add-data "./koboldcpp_clblast.so:." \
10
  --add-data "./rwkv_vocab.embd:." \
11
  --add-data "./rwkv_world_vocab.embd:." \
 
5
  --add-data "./koboldcpp.so:." \
6
  --add-data "./koboldcpp_openblas.so:." \
7
  --add-data "./koboldcpp_failsafe.so:." \
8
+ --add-data "./koboldcpp_noavx2.so:." \
9
  --add-data "./koboldcpp_clblast.so:." \
10
  --add-data "./rwkv_vocab.embd:." \
11
  --add-data "./rwkv_world_vocab.embd:." \
make_pyinstaller_hybrid_henk.bat CHANGED
@@ -2,4 +2,4 @@ cd /d "%~dp0"
2
  copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cudart64_110.dll" .\ /Y
3
  copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cublasLt64_11.dll" .\ /Y
4
  copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cublas64_11.dll" .\ /Y
5
- PyInstaller --noconfirm --onefile --collect-all customtkinter --clean --console --icon ".\niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cudart64_110.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cublas64_11.dll;." --add-data "./rwkv_vocab.embd;." --add-data "C:/Windows/System32/msvcp140.dll;." --add-data "C:/Windows/System32/vcruntime140_1.dll;." "./koboldcpp.py" -n "koboldcpp.exe"
 
2
  copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cudart64_110.dll" .\ /Y
3
  copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cublasLt64_11.dll" .\ /Y
4
  copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cublas64_11.dll" .\ /Y
5
+ PyInstaller --noconfirm --onefile --collect-all customtkinter --clean --console --icon ".\niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cudart64_110.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cublas64_11.dll;." --add-data "./rwkv_vocab.embd;." --add-data "C:/Windows/System32/msvcp140.dll;." --add-data "C:/Windows/System32/vcruntime140_1.dll;." "./koboldcpp.py" -n "koboldcpp.exe"