Illumotion committed
Commit 81bf9b4
1 Parent(s): 69fb50e

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
.gitignore CHANGED
@@ -1,5 +1,6 @@
1
  *.o
2
  *.a
 
3
  .DS_Store
4
  .build/
5
  .cache/
@@ -36,6 +37,7 @@ out/
36
  /vdot
37
  /server
38
  /Pipfile
 
39
  /libllama.so
40
 
41
  arm_neon.h
@@ -64,4 +66,5 @@ koboldcpp.dll
64
  koboldcpp_failsafe.dll
65
  koboldcpp_openblas.dll
66
  koboldcpp_openblas_noavx2.dll
67
- koboldcpp_clblast.dll
 
 
1
  *.o
2
  *.a
3
+ *.so
4
  .DS_Store
5
  .build/
6
  .cache/
 
37
  /vdot
38
  /server
39
  /Pipfile
40
+ /embd-input-test
41
  /libllama.so
42
 
43
  arm_neon.h
 
66
  koboldcpp_failsafe.dll
67
  koboldcpp_openblas.dll
68
  koboldcpp_openblas_noavx2.dll
69
+ koboldcpp_clblast.dll
70
+ koboldcpp_cublas.dll
CMakeLists.txt CHANGED
@@ -1,5 +1,5 @@
1
- # DO NOT USE THIS FILE.
2
- # IT'S ONLY FOR CUBLAS BUILD PURPOSES ON WINDOWS VISUAL STUDIO.
3
  # IT WILL NOT BE UPDATED OR MAINTAINED !!!
4
 
5
  message(STATUS "============== ============== ==============")
@@ -41,8 +41,12 @@ if (NOT MSVC)
41
  endif()
42
 
43
  # 3rd party libs
44
- option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
45
-
 
 
 
 
46
 
47
 
48
  #
@@ -69,8 +73,15 @@ if (LLAMA_CUBLAS)
69
 
70
  set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
71
  set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
 
72
 
73
  add_compile_definitions(GGML_USE_CUBLAS)
 
 
 
 
 
 
74
 
75
  if (LLAMA_STATIC)
76
  set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -83,8 +94,6 @@ if (LLAMA_CUBLAS)
83
  endif()
84
  endif()
85
 
86
-
87
-
88
  if (LLAMA_ALL_WARNINGS)
89
  if (NOT MSVC)
90
  set(c_flags
@@ -259,7 +268,8 @@ set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
259
  add_library(ggml_v2 OBJECT
260
  otherarch/ggml_v2.c
261
  otherarch/ggml_v2.h
262
- ${GGML_V2_CUDA_SOURCES})
 
263
  target_include_directories(ggml_v2 PUBLIC . ./otherarch ./otherarch/tools)
264
  target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
265
  target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
@@ -273,7 +283,7 @@ target_compile_features(common2 PUBLIC cxx_std_11) # don't bump
273
  target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
274
  set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
275
 
276
- add_library(gpttype_adapter
277
  gpttype_adapter.cpp)
278
  target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./examples)
279
  target_compile_features(gpttype_adapter PUBLIC cxx_std_11) # don't bump
@@ -287,13 +297,12 @@ if (GGML_CUDA_SOURCES)
287
  set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
288
  endif()
289
 
290
- set(TARGET koboldcpp)
291
  add_library(${TARGET} SHARED expose.cpp expose.h)
292
  target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples)
293
  target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
294
  set_target_properties(${TARGET} PROPERTIES PREFIX "")
295
- set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp")
296
  set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
297
  target_link_libraries(${TARGET} PUBLIC ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${CMAKE_THREAD_LIBS_INIT})
298
  target_compile_features(${TARGET} PRIVATE cxx_std_11)
299
-
 
1
+ # DO NOT USE THIS FILE.
2
+ # IT'S ONLY FOR CUBLAS BUILD PURPOSES ON WINDOWS VISUAL STUDIO.
3
  # IT WILL NOT BE UPDATED OR MAINTAINED !!!
4
 
5
  message(STATUS "============== ============== ==============")
 
41
  endif()
42
 
43
  # 3rd party libs
44
+ option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
45
+ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
46
+ set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
47
+ option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
48
+ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
49
+ option(LLAMA_K_QUANTS "llama: use k-quants" ON)
50
 
51
 
52
  #
 
73
 
74
  set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
75
  set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
76
+ set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
77
 
78
  add_compile_definitions(GGML_USE_CUBLAS)
79
+ add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
80
+ add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
81
+ if (LLAMA_CUDA_DMMV_F16)
82
+ add_compile_definitions(GGML_CUDA_DMMV_F16)
83
+ endif()
84
+ add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
85
 
86
  if (LLAMA_STATIC)
87
  set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
 
94
  endif()
95
  endif()
96
 
 
 
97
  if (LLAMA_ALL_WARNINGS)
98
  if (NOT MSVC)
99
  set(c_flags
 
268
  add_library(ggml_v2 OBJECT
269
  otherarch/ggml_v2.c
270
  otherarch/ggml_v2.h
271
+ ${GGML_V2_CUDA_SOURCES}
272
+ ${GGML_V2_LEGACY_CUDA_SOURCES})
273
  target_include_directories(ggml_v2 PUBLIC . ./otherarch ./otherarch/tools)
274
  target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
275
  target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 
283
  target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
284
  set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
285
 
286
+ add_library(gpttype_adapter
287
  gpttype_adapter.cpp)
288
  target_include_directories(gpttype_adapter PUBLIC . ./otherarch ./otherarch/tools ./examples)
289
  target_compile_features(gpttype_adapter PUBLIC cxx_std_11) # don't bump
 
297
  set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
298
  endif()
299
 
300
+ set(TARGET koboldcpp_cublas)
301
  add_library(${TARGET} SHARED expose.cpp expose.h)
302
  target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples)
303
  target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump
304
  set_target_properties(${TARGET} PROPERTIES PREFIX "")
305
+ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
306
  set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
307
  target_link_libraries(${TARGET} PUBLIC ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${CMAKE_THREAD_LIBS_INIT})
308
  target_compile_features(${TARGET} PRIVATE cxx_std_11)
 
Dockerfile CHANGED
@@ -4,7 +4,7 @@ COPY . .
4
  RUN apt update \
5
  && apt install build-essential wget libopenblas-dev make -y \
6
  && make LLAMA_OPENBLAS=1 \
7
- && wget https://huggingface.co/Yoshiii/pygmalion-7b-ggml/resolve/main/pygmalion-7b-q5_K_M.bin\
8
  && apt remove build-essential wget make -y
9
 
10
  ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-7b-q5_K_M.bin", "--port", "7860"]
 
4
  RUN apt update \
5
  && apt install build-essential wget libopenblas-dev make -y \
6
  && make LLAMA_OPENBLAS=1 \
7
+ && wget https://huggingface.co/notstoic/pygmalion-13b-ggml/resolve/main/pygmalion-13b-ggml-q4_0.bin \
8
  && apt remove build-essential wget make -y
9
 
10
  ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-7b-q5_K_M.bin", "--port", "7860"]
Makefile CHANGED
@@ -1,4 +1,4 @@
1
- default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast
2
  tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
3
  dev: koboldcpp_openblas
4
  dev2: koboldcpp_clblast
@@ -42,7 +42,7 @@ endif
42
 
43
  # keep standard at C11 and C++11
44
  CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
45
- CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC
46
  LDFLAGS =
47
 
48
  # these are used on windows, to build some libraries with extra old device compatibility
@@ -53,6 +53,13 @@ NONECFLAGS =
53
  OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
54
  CLBLAST_FLAGS = -DGGML_USE_CLBLAST
55
  FAILSAFE_FLAGS = -DUSE_FAILSAFE
 
 
 
 
 
 
 
56
 
57
  #lets try enabling everything
58
  CFLAGS += -pthread -s
@@ -133,10 +140,9 @@ endif
133
 
134
  # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
135
  ifdef LLAMA_CUBLAS
136
- CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
137
- CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
138
- LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
139
- OBJS += ggml-cuda.o ggml_v2-cuda.o
140
  NVCC = nvcc
141
  NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
142
  ifdef LLAMA_CUDA_DMMV_X
@@ -158,9 +164,11 @@ else
158
  NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
159
  endif
160
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
161
- $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
162
  ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
163
- $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 
 
164
  endif # LLAMA_CUBLAS
165
 
166
  ifdef LLAMA_METAL
@@ -197,7 +205,7 @@ FAILSAFE_BUILD =
197
  OPENBLAS_BUILD =
198
  OPENBLAS_NOAVX2_BUILD =
199
  CLBLAST_BUILD =
200
- CLBLAST_NOAVX2_BUILD =
201
 
202
  ifeq ($(OS),Windows_NT)
203
  DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
@@ -205,7 +213,11 @@ ifeq ($(OS),Windows_NT)
205
  OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
206
  OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
207
  CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
208
- CLBLAST_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
 
 
 
 
209
  else
210
  DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
211
  FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
@@ -216,20 +228,26 @@ else
216
  ifdef LLAMA_CLBLAST
217
  ifeq ($(UNAME_S),Darwin)
218
  CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
219
- CLBLAST_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
220
  else
221
  CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
222
- CLBLAST_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
223
  endif
224
  endif
225
 
 
 
 
 
226
  ifndef LLAMA_OPENBLAS
227
  ifndef LLAMA_CLBLAST
 
228
  OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
229
  endif
230
  endif
 
231
  endif
232
 
 
 
233
  #
234
  # Print build information
235
  #
@@ -259,8 +277,8 @@ ggml_openblas_noavx2.o: ggml.c ggml.h
259
  $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
260
  ggml_clblast.o: ggml.c ggml.h
261
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
262
- ggml_clblast_noavx2.o: ggml.c ggml.h
263
- $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
264
 
265
  #quants K
266
  k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
@@ -281,8 +299,8 @@ ggml_v2_openblas_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
281
  $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
282
  ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
283
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
284
- ggml_v2_clblast_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
285
- $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
286
 
287
  #extreme old version compat
288
  ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
@@ -311,9 +329,11 @@ gpttype_adapter.o: gpttype_adapter.cpp
311
  $(CXX) $(CXXFLAGS) -c $< -o $@
312
  gpttype_adapter_clblast.o: gpttype_adapter.cpp
313
  $(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
 
 
314
 
315
  clean:
316
- rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so
317
 
318
  main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o $(OBJS)
319
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -332,8 +352,8 @@ koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml
332
  $(OPENBLAS_NOAVX2_BUILD)
333
  koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o $(OBJS)
334
  $(CLBLAST_BUILD)
335
- koboldcpp_clblast_noavx2: ggml_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants_noavx2.o $(OBJS)
336
- $(CLBLAST_NOAVX2_BUILD)
337
 
338
  quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o
339
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
1
+ default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast koboldcpp_cublas
2
  tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt
3
  dev: koboldcpp_openblas
4
  dev2: koboldcpp_clblast
 
42
 
43
  # keep standard at C11 and C++11
44
  CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
45
+ CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
46
  LDFLAGS =
47
 
48
  # these are used on windows, to build some libraries with extra old device compatibility
 
53
  OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
54
  CLBLAST_FLAGS = -DGGML_USE_CLBLAST
55
  FAILSAFE_FLAGS = -DUSE_FAILSAFE
56
+ ifdef LLAMA_CUBLAS
57
+ CUBLAS_FLAGS = -DGGML_USE_CUBLAS
58
+ else
59
+ CUBLAS_FLAGS =
60
+ endif
61
+ CUBLASLD_FLAGS =
62
+ CUBLAS_OBJS =
63
 
64
  #lets try enabling everything
65
  CFLAGS += -pthread -s
 
140
 
141
  # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
142
  ifdef LLAMA_CUBLAS
143
+ CUBLAS_FLAGS = -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
144
+ CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
145
+ CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 
146
  NVCC = nvcc
147
  NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
148
  ifdef LLAMA_CUDA_DMMV_X
 
164
  NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
165
  endif
166
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
167
+ $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
168
  ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
169
+ $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
170
+ ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
171
+ $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
172
  endif # LLAMA_CUBLAS
173
 
174
  ifdef LLAMA_METAL
 
205
  OPENBLAS_BUILD =
206
  OPENBLAS_NOAVX2_BUILD =
207
  CLBLAST_BUILD =
208
+ CUBLAS_BUILD =
209
 
210
  ifeq ($(OS),Windows_NT)
211
  DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
 
213
  OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
214
  OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
215
  CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
216
+
217
+ ifdef LLAMA_CUBLAS
218
+ CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS)
219
+ endif
220
+
221
  else
222
  DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
223
  FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS)
 
228
  ifdef LLAMA_CLBLAST
229
  ifeq ($(UNAME_S),Darwin)
230
  CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
 
231
  else
232
  CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
 
233
  endif
234
  endif
235
 
236
+ ifdef LLAMA_CUBLAS
237
+ CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS)
238
+ endif
239
+
240
  ifndef LLAMA_OPENBLAS
241
  ifndef LLAMA_CLBLAST
242
+ ifndef LLAMA_CUBLAS
243
  OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
244
  endif
245
  endif
246
+ endif
247
  endif
248
 
249
+
250
+
251
  #
252
  # Print build information
253
  #
 
277
  $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
278
  ggml_clblast.o: ggml.c ggml.h
279
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
280
+ ggml_cublas.o: ggml.c ggml.h
281
+ $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
282
 
283
  #quants K
284
  k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
 
299
  $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
300
  ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
301
  $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
302
+ ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
303
+ $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
304
 
305
  #extreme old version compat
306
  ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
 
329
  $(CXX) $(CXXFLAGS) -c $< -o $@
330
  gpttype_adapter_clblast.o: gpttype_adapter.cpp
331
  $(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
332
+ gpttype_adapter_cublas.o: gpttype_adapter.cpp
333
+ $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@
334
 
335
  clean:
336
+ rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so
337
 
338
  main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o $(OBJS)
339
  $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
352
  $(OPENBLAS_NOAVX2_BUILD)
353
  koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o $(OBJS)
354
  $(CLBLAST_BUILD)
355
+ koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o $(CUBLAS_OBJS) $(OBJS)
356
+ $(CUBLAS_BUILD)
357
 
358
  quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o
359
  $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
README.md CHANGED
@@ -1,7 +1,74 @@
1
- ---
2
- title: koboldcpp
3
- sdk: docker
4
- emoji: 📚
5
- colorFrom: blue
6
- colorTo: purple
7
- ---
1
+ # koboldcpp
2
+
3
+ A self-contained distributable from Concedo that exposes llama.cpp function bindings, allowing it to be used via a simulated Kobold API endpoint.
4
+
5
+ What does it mean? You get llama.cpp with a fancy UI, persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer, in a tiny package around 20 MB in size, excluding model weights.
6
+
7
+ ![Preview](media/preview.png)
8
+
9
+ ## Usage
10
+ - **[Download the latest .exe release here](https://github.com/LostRuins/koboldcpp/releases/latest)** or clone the git repo.
11
+ - Windows binaries are provided in the form of **koboldcpp.exe**, which is a pyinstaller wrapper for a few **.dll** files and **koboldcpp.py**. If you feel concerned, you may prefer to rebuild it yourself with the provided makefiles and scripts.
12
+ - Weights are not included, you can use the official llama.cpp `quantize.exe` to generate them from your official weight files (or download them from other places).
13
+ - To run, execute **koboldcpp.exe** or drag and drop your quantized `ggml_model.bin` file onto the .exe, and then connect with Kobold or Kobold Lite. If you're not on windows, then run the script **KoboldCpp.py** after compiling the libraries.
14
+ - By default, you can connect to http://localhost:5001
15
+ - You can also run it using the command line `koboldcpp.exe [ggml_model.bin] [port]`. For info, please check `koboldcpp.exe --help`
16
+ - Big context still too slow? Try the `--smartcontext` flag to reduce prompt processing frequency. Also, you can try to run with your GPU using CLBlast, with the `--useclblast` flag for a speedup.
17
+ - Want even more speedup? Combine `--useclblast` with `--gpulayers` to offload entire layers to the GPU! **Much faster, but uses more VRAM**. Experiment to determine the number of layers to offload.
18
+ - If you are having crashes or issues, you can try turning off BLAS with the `--noblas` flag. You can also try running in a non-avx2 compatibility mode with `--noavx2`. Lastly, you can try turning off mmap with `--nommap`.
19
+
20
+ For more information, be sure to run the program with the `--help` flag.
21
+
22
+ ## OSX and Linux
23
+ - You will have to compile your binaries from source. A makefile is provided, simply run `make`
24
+ - If you want you can also link your own install of OpenBLAS manually with `make LLAMA_OPENBLAS=1`
25
+ - Alternatively, if you want you can also link your own install of CLBlast manually with `make LLAMA_CLBLAST=1`, for this you will need to obtain and link OpenCL and CLBlast libraries.
26
+ - For Arch Linux: Install `cblas` `openblas` and `clblast`.
27
+ - For Debian: Install `libclblast-dev` and `libopenblas-dev`.
28
+ - For a full featured build, do `make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1`
29
+ - After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.bin] [port]`
30
+ - Note: Many OSX users have found that using Accelerate is actually faster than OpenBLAS. To try, you may wish to run with `--noblas` and compare speeds.
31
+
32
+ ## Compiling on Windows
33
+ - You're encouraged to use the released .exe, but if you want to compile your binaries from source on Windows, the easiest way is:
34
+ - Use the latest release of w64devkit (https://github.com/skeeto/w64devkit). Be sure to use the "vanilla" one, not i686 or other variants. If you try them, they will conflict with the precompiled libs!
35
+ - Make sure you are using the w64devkit integrated terminal, then run 'make' at the KoboldCpp source folder. This will create the .dll files.
36
+ - If you want to generate the .exe file, make sure you have the python module PyInstaller installed with pip ('pip install PyInstaller').
37
+ - Run the script make_pyinstaller.bat at a regular terminal (or Windows Explorer).
38
+ - The koboldcpp.exe file will be in your dist folder.
39
+ - If you wish to use your own version of the additional Windows libraries (OpenCL, CLBlast and OpenBLAS), you can do it with:
40
+ - OpenCL - tested with https://github.com/KhronosGroup/OpenCL-SDK . If you wish to compile it, follow the repository instructions. You will need vcpkg.
41
+ - CLBlast - tested with https://github.com/CNugteren/CLBlast . If you wish to compile it you will need to reference the OpenCL files. It will only generate the ".lib" file if you compile using MSVC.
42
+ - OpenBLAS - tested with https://github.com/xianyi/OpenBLAS .
43
+ - Move the respective .lib files to the /lib folder of your project, overwriting the older files.
44
+ - Also, replace the existing versions of the corresponding .dll files located in the project directory root (e.g. libopenblas.dll).
45
+ - Make the KoboldCPP project using the instructions above.
46
+
47
+ ## Android (Termux) Alternative method
48
+ - See https://github.com/ggerganov/llama.cpp/pull/1828/files
49
+
50
+ ## CuBLAS?
51
+ - You can attempt a CuBLAS build with `LLAMA_CUBLAS=1` or using the provided CMake file (best for visual studio users). If you use the CMake file to build, copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC. Note that support for CuBLAS is limited.
52
+
53
+ ## Considerations
54
+ - For Windows: No installation, single file executable, (It Just Works)
55
+ - Since v1.0.6, requires libopenblas, the prebuilt windows binaries are included in this repo. If not found, it will fall back to a mode without BLAS.
56
+ - Since v1.15, requires CLBlast if enabled, the prebuilt windows binaries are included in this repo. If not found, it will fall back to a mode without CLBlast.
57
+ - **I plan to keep backwards compatibility with ALL past llama.cpp AND alpaca.cpp models**. But you are also encouraged to reconvert/update your models if possible for best results.
58
+
59
+ ## License
60
+ - The original GGML library and llama.cpp by ggerganov are licensed under the MIT License
61
+ - However, Kobold Lite is licensed under the AGPL v3.0 License
62
+ - The other files are also under the AGPL v3.0 License unless otherwise stated
63
+
64
+ ## Notes
65
+ - Generation delay scales linearly with original prompt length. If OpenBLAS is enabled then prompt ingestion becomes about 2-3x faster. This is automatic on windows, but will require linking on OSX and Linux. CLBlast speeds this up even further, and `--gpulayers` + `--useclblast` more so.
66
+ - I have heard of someone claiming a false AV positive report. The exe is a simple pyinstaller bundle that includes the necessary python scripts and dlls to run. If this still concerns you, you might wish to rebuild everything from source code using the makefile, and you can rebuild the exe yourself with pyinstaller by using `make_pyinstaller.bat`
67
+ - Supported GGML models:
68
+ - LLAMA (All versions including ggml, ggmf, ggjt v1,v2,v3, openllama, gpt4all). Supports CLBlast and OpenBLAS acceleration for all versions.
69
+ - GPT-2 (All versions, including legacy f16, newer format + quantized, cerebras, starcoder) Supports CLBlast and OpenBLAS acceleration for newer formats, no GPU layer offload.
70
+ - GPT-J (All versions including legacy f16, newer format + quantized, pyg.cpp, new pygmalion, janeway etc.) Supports CLBlast and OpenBLAS acceleration for newer formats, no GPU layer offload.
71
+ - RWKV (all formats except Q4_1_O).
72
+ - GPT-NeoX / Pythia / StableLM / Dolly / RedPajama
73
+ - MPT models (ggjt v3)
74
+ - Basically every single current and historical GGML format that has ever existed should be supported, except for bloomz.cpp due to lack of demand.
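
The Usage notes in this README boil down to: start the program with a quantized model, then talk to it over HTTP on port 5001. A minimal Python sketch of that interaction, assuming the server is already running and exposes the usual Kobold-style `/api/v1/generate` route (an assumption about the client side, not something this diff specifies):

```python
# Minimal sketch: send a prompt to a locally running koboldcpp instance.
# Assumes `python koboldcpp.py ggml_model.bin 5001` (or koboldcpp.exe) is
# already running and serves a Kobold-style /api/v1/generate endpoint.
import json
import urllib.request

payload = {
    "prompt": "Once upon a time,",
    "max_length": 64,        # number of new tokens to request
    "temperature": 0.7,
}
req = urllib.request.Request(
    "http://localhost:5001/api/v1/generate",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    result = json.load(resp)

# Kobold-style servers typically reply with {"results": [{"text": "..."}]}.
print(result["results"][0]["text"])
```

This is roughly what a Kobold or Kobold Lite frontend does when it connects to the server.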
convert-lora-to-ggml.py CHANGED
@@ -113,6 +113,10 @@ with open(output_path, "wb") as fout:
113
 
114
  write_file_header(fout, params)
115
  for k, v in model.items():
 
 
 
 
116
  if k.endswith("lora_A.weight"):
117
  if v.dtype != torch.float16 and v.dtype != torch.float32:
118
  v = v.float()
@@ -120,7 +124,7 @@ with open(output_path, "wb") as fout:
120
  else:
121
  v = v.float()
122
 
123
- t = v.numpy()
124
  tname = translate_tensor_name(k)
125
  print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
126
  write_tensor_header(fout, tname, t.shape, t.dtype)
 
113
 
114
  write_file_header(fout, params)
115
  for k, v in model.items():
116
+ if k.endswith(".default.weight"):
117
+ k = k.replace(".default.weight", ".weight")
118
+ if k in ["llama_proj.weight", "llama_proj.bias"]:
119
+ continue
120
  if k.endswith("lora_A.weight"):
121
  if v.dtype != torch.float16 and v.dtype != torch.float32:
122
  v = v.float()
 
124
  else:
125
  v = v.float()
126
 
127
+ t = v.detach().numpy()
128
  tname = translate_tensor_name(k)
129
  print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
130
  write_tensor_header(fout, tname, t.shape, t.dtype)
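
Two details of this converter change are easy to miss: keys ending in `.default.weight` are normalized and the `llama_proj.*` tensors are skipped (presumably because they are the projection weights used by the embd-input examples added below, not LoRA matrices), and `t = v.numpy()` becomes `t = v.detach().numpy()`. The detach is needed because PyTorch refuses to convert a gradient-tracking tensor to NumPy. A small standalone sketch of just that point, separate from the converter itself:

```python
# Why v.detach().numpy() is needed: tensors with requires_grad=True cannot be
# converted to NumPy directly; detach() first drops the autograd graph.
import torch

v = torch.randn(4, 4, requires_grad=True)

try:
    v.numpy()  # raises RuntimeError for gradient-tracking tensors
except RuntimeError as err:
    print("direct conversion failed:", err)

t = v.detach().numpy()  # works: same storage, no autograd bookkeeping
print(t.shape, t.dtype)
```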
cudart64_110.dll ADDED
Binary file (518 kB).
 
examples/CMakeLists.txt CHANGED
@@ -39,6 +39,7 @@ else()
39
  add_subdirectory(baby-llama)
40
  add_subdirectory(train-text-from-scratch)
41
  add_subdirectory(simple)
 
42
  if (LLAMA_METAL)
43
  add_subdirectory(metal)
44
  endif()
 
39
  add_subdirectory(baby-llama)
40
  add_subdirectory(train-text-from-scratch)
41
  add_subdirectory(simple)
42
+ add_subdirectory(embd-input)
43
  if (LLAMA_METAL)
44
  add_subdirectory(metal)
45
  endif()
examples/baby-llama/baby-llama.cpp CHANGED
@@ -566,8 +566,8 @@ struct ggml_tensor * forward(
566
  // wk shape [n_embd, n_embd, 1, 1]
567
  // Qcur shape [n_embd/n_head, n_head, N, 1]
568
  // Kcur shape [n_embd/n_head, n_head, N, 1]
569
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
570
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
571
 
572
  // store key and value to memory
573
  {
@@ -823,8 +823,8 @@ struct ggml_tensor * forward_batch(
823
  // wk shape [n_embd, n_embd, 1, 1]
824
  // Qcur shape [n_embd/n_head, n_head, N, n_batch]
825
  // Kcur shape [n_embd/n_head, n_head, N, n_batch]
826
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
827
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
828
  assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
829
  assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
830
 
@@ -1116,7 +1116,7 @@ struct ggml_tensor * forward_lora(
1116
  model->layers[il].wqb,
1117
  cur)),
1118
  n_embd/n_head, n_head, N),
1119
- n_past, n_rot, 0);
1120
  struct ggml_tensor * Kcur = ggml_rope(ctx0,
1121
  ggml_reshape_3d(ctx0,
1122
  ggml_mul_mat(ctx0,
@@ -1125,7 +1125,7 @@ struct ggml_tensor * forward_lora(
1125
  model->layers[il].wkb,
1126
  cur)),
1127
  n_embd/n_head, n_head, N),
1128
- n_past, n_rot, 0);
1129
 
1130
  // store key and value to memory
1131
  {
 
566
  // wk shape [n_embd, n_embd, 1, 1]
567
  // Qcur shape [n_embd/n_head, n_head, N, 1]
568
  // Kcur shape [n_embd/n_head, n_head, N, 1]
569
+ struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
570
+ struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
571
 
572
  // store key and value to memory
573
  {
 
823
  // wk shape [n_embd, n_embd, 1, 1]
824
  // Qcur shape [n_embd/n_head, n_head, N, n_batch]
825
  // Kcur shape [n_embd/n_head, n_head, N, n_batch]
826
+ struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
827
+ struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
828
  assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
829
  assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
830
 
 
1116
  model->layers[il].wqb,
1117
  cur)),
1118
  n_embd/n_head, n_head, N),
1119
+ n_past, n_rot, 0, 0);
1120
  struct ggml_tensor * Kcur = ggml_rope(ctx0,
1121
  ggml_reshape_3d(ctx0,
1122
  ggml_mul_mat(ctx0,
 
1125
  model->layers[il].wkb,
1126
  cur)),
1127
  n_embd/n_head, n_head, N),
1128
+ n_past, n_rot, 0, 0);
1129
 
1130
  // store key and value to memory
1131
  {
examples/common.cpp CHANGED
@@ -110,7 +110,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
110
  invalid_param = true;
111
  break;
112
  }
113
- params.seed = std::stoi(argv[i]);
114
  } else if (arg == "-t" || arg == "--threads") {
115
  if (++i >= argc) {
116
  invalid_param = true;
@@ -343,6 +343,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
343
  params.use_mmap = false;
344
  } else if (arg == "--mtest") {
345
  params.mem_test = true;
 
 
346
  } else if (arg == "--export") {
347
  params.export_cgraph = true;
348
  } else if (arg == "--verbose-prompt") {
@@ -414,13 +416,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
414
  exit(1);
415
  }
416
 
417
- #ifdef GGML_USE_CUBLAS
418
- if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
419
- fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
420
- exit(1);
421
- }
422
- #endif // GGML_USE_CUBLAS
423
-
424
  if (escape_prompt) {
425
  process_escapes(params.prompt);
426
  }
@@ -488,6 +483,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
488
  if (llama_mmap_supported()) {
489
  fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
490
  }
 
 
 
491
  #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
492
  fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
493
  fprintf(stderr, " number of layers to store in VRAM\n");
 
110
  invalid_param = true;
111
  break;
112
  }
113
+ params.seed = std::stoul(argv[i]);
114
  } else if (arg == "-t" || arg == "--threads") {
115
  if (++i >= argc) {
116
  invalid_param = true;
 
343
  params.use_mmap = false;
344
  } else if (arg == "--mtest") {
345
  params.mem_test = true;
346
+ } else if (arg == "--numa") {
347
+ params.numa = true;
348
  } else if (arg == "--export") {
349
  params.export_cgraph = true;
350
  } else if (arg == "--verbose-prompt") {
 
416
  exit(1);
417
  }
418
 
 
 
 
 
 
 
 
419
  if (escape_prompt) {
420
  process_escapes(params.prompt);
421
  }
 
483
  if (llama_mmap_supported()) {
484
  fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
485
  }
486
+ fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n");
487
+ fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n");
488
+ fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
489
  #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
490
  fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
491
  fprintf(stderr, " number of layers to store in VRAM\n");
examples/common.h CHANGED
@@ -22,7 +22,7 @@
22
  int32_t get_num_physical_cores();
23
 
24
  struct gpt_params {
25
- int32_t seed = -1; // RNG seed
26
  int32_t n_threads = get_num_physical_cores();
27
  int32_t n_predict = -1; // new tokens to predict
28
  int32_t n_ctx = 512; // context size
@@ -76,6 +76,7 @@ struct gpt_params {
76
  bool use_mmap = true; // use mmap for faster loads
77
  bool use_mlock = false; // use mlock to keep model in memory
78
  bool mem_test = false; // compute maximum memory usage
 
79
  bool export_cgraph = false; // export the computation graph
80
  bool verbose_prompt = false; // print prompt tokens before generation
81
  };
 
22
  int32_t get_num_physical_cores();
23
 
24
  struct gpt_params {
25
+ uint32_t seed = -1; // RNG seed
26
  int32_t n_threads = get_num_physical_cores();
27
  int32_t n_predict = -1; // new tokens to predict
28
  int32_t n_ctx = 512; // context size
 
76
  bool use_mmap = true; // use mmap for faster loads
77
  bool use_mlock = false; // use mlock to keep model in memory
78
  bool mem_test = false; // compute maximum memory usage
79
+ bool numa = false; // attempt optimizations that help on some NUMA systems
80
  bool export_cgraph = false; // export the computation graph
81
  bool verbose_prompt = false; // print prompt tokens before generation
82
  };
examples/embd-input/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ PandaGPT
2
+ MiniGPT-4
3
+ *.pth
4
+
examples/embd-input/CMakeLists.txt ADDED
@@ -0,0 +1,15 @@
1
+ set(TARGET embdinput)
2
+ add_library(${TARGET} embd-input-lib.cpp embd-input.h)
3
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
4
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
5
+ if(TARGET BUILD_INFO)
6
+ add_dependencies(${TARGET} BUILD_INFO)
7
+ endif()
8
+
9
+ set(TARGET embd-input-test)
10
+ add_executable(${TARGET} embd-input-test.cpp)
11
+ target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
12
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
13
+ if(TARGET BUILD_INFO)
14
+ add_dependencies(${TARGET} BUILD_INFO)
15
+ endif()
examples/embd-input/README.md ADDED
@@ -0,0 +1,63 @@
1
+ ### Examples for input embedding directly
2
+
3
+ ## Requirement
4
+ build `libembdinput.so`
5
+ run the following command in the main dir (../../).
6
+ ```
7
+ make
8
+ ```
9
+
10
+ ## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)
11
+
12
+ 1. Obtain the LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
13
+ 2. Convert it to ggml format.
14
+ 3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
15
+
16
+ ```
17
+ import torch
18
+
19
+ bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
20
+ pth_path = "./examples/embd_input/llava_projection.pth"
21
+
22
+ dic = torch.load(bin_path)
23
+ used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
24
+ torch.save({k: dic[k] for k in used_key}, pth_path)
25
+ ```
26
+ 4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`.
27
+
28
+
29
+ ## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
30
+
31
+ 1. Obtain the PandaGPT LoRA model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
32
+ The `adapter_config.json` is
33
+ ```
34
+ {
35
+ "peft_type": "LORA",
36
+ "fan_in_fan_out": false,
37
+ "bias": null,
38
+ "modules_to_save": null,
39
+ "r": 32,
40
+ "lora_alpha": 32,
41
+ "lora_dropout": 0.1,
42
+ "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
43
+ }
44
+ ```
45
+ 2. Prepare the `vicuna` v0 model.
46
+ 3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
47
+ 4. Clone the PandaGPT source.
48
+ ```
49
+ git clone https://github.com/yxuansu/PandaGPT
50
+ ```
51
+ 5. Install the requirements of PandaGPT.
52
+ 6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
53
+
54
+ ## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
55
+
56
+ 1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
57
+ 2. Clone the MiniGPT-4 source.
58
+ ```
59
+ git clone https://github.com/Vision-CAIR/MiniGPT-4/
60
+ ```
61
+ 3. Install the requirements of MiniGPT-4.
62
+ 4. Prepare the `vicuna` v0 model.
63
+ 5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
examples/embd-input/embd-input-lib.cpp ADDED
@@ -0,0 +1,220 @@
1
+ // Defines sigaction on msys:
2
+ #ifndef _GNU_SOURCE
3
+ #define _GNU_SOURCE
4
+ #endif
5
+
6
+ #include "embd-input.h"
7
+
8
+ #include <cassert>
9
+ #include <cinttypes>
10
+ #include <cmath>
11
+ #include <cstdio>
12
+ #include <cstring>
13
+ #include <ctime>
14
+ #include <fstream>
15
+ #include <iostream>
16
+ #include <string>
17
+ #include <vector>
18
+
19
+ static llama_context ** g_ctx;
20
+
21
+ extern "C" {
22
+
23
+ struct MyModel* create_mymodel(int argc, char ** argv) {
24
+ gpt_params params;
25
+
26
+ if (gpt_params_parse(argc, argv, params) == false) {
27
+ return nullptr;
28
+ }
29
+
30
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
31
+
32
+ if (params.seed < 0) {
33
+ params.seed = time(NULL);
34
+ }
35
+ fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
36
+
37
+ llama_init_backend(params.numa);
38
+
39
+ llama_model * model;
40
+ llama_context * ctx;
41
+
42
+ g_ctx = &ctx;
43
+
44
+ // load the model and apply lora adapter, if any
45
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
46
+ if (model == NULL) {
47
+ fprintf(stderr, "%s: error: unable to load model\n", __func__);
48
+ return nullptr;
49
+ }
50
+
51
+ // print system information
52
+ {
53
+ fprintf(stderr, "\n");
54
+ fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
55
+ params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
56
+ }
57
+ struct MyModel * ret = new MyModel();
58
+ ret->ctx = ctx;
59
+ ret->params = params;
60
+ ret->n_past = 0;
61
+ // printf("ctx: %d\n", ret->ctx);
62
+ return ret;
63
+ }
64
+
65
+ void free_mymodel(struct MyModel * mymodel) {
66
+ llama_context * ctx = mymodel->ctx;
67
+ llama_print_timings(ctx);
68
+ llama_free(ctx);
69
+ delete mymodel;
70
+ }
71
+
72
+
73
+ bool eval_float(void * model, float * input, int N){
74
+ MyModel * mymodel = (MyModel*)model;
75
+ llama_context * ctx = mymodel->ctx;
76
+ gpt_params params = mymodel->params;
77
+ int n_emb = llama_n_embd(ctx);
78
+ int n_past = mymodel->n_past;
79
+ int n_batch = N; // params.n_batch;
80
+
81
+ for (int i = 0; i < (int) N; i += n_batch) {
82
+ int n_eval = (int) N - i;
83
+ if (n_eval > n_batch) {
84
+ n_eval = n_batch;
85
+ }
86
+ if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) {
87
+ fprintf(stderr, "%s : failed to eval\n", __func__);
88
+ return false;
89
+ }
90
+ n_past += n_eval;
91
+ }
92
+ mymodel->n_past = n_past;
93
+ return true;
94
+ }
95
+
96
+ bool eval_tokens(void * model, std::vector<llama_token> tokens) {
97
+ MyModel * mymodel = (MyModel* )model;
98
+ llama_context * ctx;
99
+ ctx = mymodel->ctx;
100
+ gpt_params params = mymodel->params;
101
+ int n_past = mymodel->n_past;
102
+ for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
103
+ int n_eval = (int) tokens.size() - i;
104
+ if (n_eval > params.n_batch) {
105
+ n_eval = params.n_batch;
106
+ }
107
+ if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) {
108
+ fprintf(stderr, "%s : failed to eval\n", __func__);
109
+ return false;
110
+ }
111
+ n_past += n_eval;
112
+ }
113
+ mymodel->n_past = n_past;
114
+ return true;
115
+ }
116
+
117
+ bool eval_id(struct MyModel* mymodel, int id) {
118
+ std::vector<llama_token> tokens;
119
+ tokens.push_back(id);
120
+ return eval_tokens(mymodel, tokens);
121
+ }
122
+
123
+ bool eval_string(struct MyModel * mymodel,const char* str){
124
+ llama_context * ctx = mymodel->ctx;
125
+ std::string str2 = str;
126
+ std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
127
+ eval_tokens(mymodel, embd_inp);
128
+ return true;
129
+ }
130
+
131
+ llama_token sampling_id(struct MyModel* mymodel) {
132
+ llama_context* ctx = mymodel->ctx;
133
+ gpt_params params = mymodel->params;
134
+ // int n_ctx = llama_n_ctx(ctx);
135
+
136
+ // out of user input, sample next token
137
+ const float temp = params.temp;
138
+ const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
139
+ const float top_p = params.top_p;
140
+ const float tfs_z = params.tfs_z;
141
+ const float typical_p = params.typical_p;
142
+ // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
143
+ // const float repeat_penalty = params.repeat_penalty;
144
+ // const float alpha_presence = params.presence_penalty;
145
+ // const float alpha_frequency = params.frequency_penalty;
146
+ const int mirostat = params.mirostat;
147
+ const float mirostat_tau = params.mirostat_tau;
148
+ const float mirostat_eta = params.mirostat_eta;
149
+ // const bool penalize_nl = params.penalize_nl;
150
+
151
+ llama_token id = 0;
152
+ {
153
+ auto logits = llama_get_logits(ctx);
154
+ auto n_vocab = llama_n_vocab(ctx);
155
+
156
+ // Apply params.logit_bias map
157
+ for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
158
+ logits[it->first] += it->second;
159
+ }
160
+
161
+ std::vector<llama_token_data> candidates;
162
+ candidates.reserve(n_vocab);
163
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
164
+ candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
165
+ }
166
+
167
+ llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
168
+
169
+ // TODO: Apply penalties
170
+ // float nl_logit = logits[llama_token_nl()];
171
+ // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
172
+ // llama_sample_repetition_penalty(ctx, &candidates_p,
173
+ // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
174
+ // last_n_repeat, repeat_penalty);
175
+ // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
176
+ // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
177
+ // last_n_repeat, alpha_frequency, alpha_presence);
178
+ // if (!penalize_nl) {
179
+ // logits[llama_token_nl()] = nl_logit;
180
+ // }
181
+
182
+ if (temp <= 0) {
183
+ // Greedy sampling
184
+ id = llama_sample_token_greedy(ctx, &candidates_p);
185
+ } else {
186
+ if (mirostat == 1) {
187
+ static float mirostat_mu = 2.0f * mirostat_tau;
188
+ const int mirostat_m = 100;
189
+ llama_sample_temperature(ctx, &candidates_p, temp);
190
+ id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
191
+ } else if (mirostat == 2) {
192
+ static float mirostat_mu = 2.0f * mirostat_tau;
193
+ llama_sample_temperature(ctx, &candidates_p, temp);
194
+ id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
195
+ } else {
196
+ // Temperature sampling
197
+ llama_sample_top_k(ctx, &candidates_p, top_k, 1);
198
+ llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
199
+ llama_sample_typical(ctx, &candidates_p, typical_p, 1);
200
+ llama_sample_top_p(ctx, &candidates_p, top_p, 1);
201
+ llama_sample_temperature(ctx, &candidates_p, temp);
202
+ id = llama_sample_token(ctx, &candidates_p);
203
+ }
204
+ }
205
+ }
206
+
207
+ return id;
208
+ }
209
+
210
+ const char * sampling(struct MyModel * mymodel) {
211
+ llama_context * ctx = mymodel->ctx;
212
+ int id = sampling_id(mymodel);
213
+ std::string ret;
214
+ if (id == llama_token_eos()) ret = "</s>";
215
+ else ret = llama_token_to_str(ctx, id);
216
+ eval_id(mymodel, id);
217
+ return ret.c_str();
218
+ }
219
+
220
+ }
examples/embd-input/embd-input-test.cpp ADDED
@@ -0,0 +1,35 @@
1
+ #include "embd-input.h"
2
+ #include <stdlib.h>
3
+ #include <random>
4
+ #include <string.h>
5
+
6
+ int main(int argc, char** argv) {
7
+
8
+ auto mymodel = create_mymodel(argc, argv);
9
+ int N = 10;
10
+ int max_tgt_len = 500;
11
+ int n_embd = llama_n_embd(mymodel->ctx);
12
+
13
+ // add random float embd to test evaluation
14
+ float * data = new float[N*n_embd];
15
+ std::default_random_engine e;
16
+ std::uniform_real_distribution<float> u(0,1);
17
+ for (int i=0;i<N*n_embd;i++) {
18
+ data[i] = u(e);
19
+ }
20
+
21
+ eval_string(mymodel, "user: what is the color of the flag of UN?");
22
+ eval_float(mymodel, data, N);
23
+ eval_string(mymodel, "assistant:");
24
+ eval_string(mymodel, mymodel->params.prompt.c_str());
25
+ const char* tmp;
26
+ for (int i=0; i<max_tgt_len; i++) {
27
+ tmp = sampling(mymodel);
28
+ if (strcmp(tmp, "</s>")==0) break;
29
+ printf("%s", tmp);
30
+ fflush(stdout);
31
+ }
32
+ printf("\n");
33
+ free_mymodel(mymodel);
34
+ return 0;
35
+ }
examples/embd-input/embd-input.h ADDED
@@ -0,0 +1,30 @@
1
+ #ifndef _EMBD_INPUT_H_
2
+ #define _EMBD_INPUT_H_ 1
3
+
4
+ #include "common.h"
5
+ #include "llama.h"
6
+ #include "build-info.h"
7
+
8
+
9
+ extern "C" {
10
+
11
+ typedef struct MyModel {
12
+ llama_context* ctx;
13
+ gpt_params params;
14
+ int n_past = 0;
15
+ } MyModel;
16
+
17
+
18
+ struct MyModel* create_mymodel(int argc, char ** argv);
19
+
20
+ bool eval_float(void* model, float* input, int N);
21
+ bool eval_tokens(void* model, std::vector<llama_token> tokens);
22
+ bool eval_id(struct MyModel* mymodel, int id);
23
+ bool eval_string(struct MyModel* mymodel, const char* str);
24
+ const char* sampling(struct MyModel* mymodel);
25
+ llama_token sampling_id(struct MyModel* mymodel);
26
+ void free_mymodel(struct MyModel* mymodel);
27
+
28
+ }
29
+
30
+ #endif
examples/embd-input/embd_input.py ADDED
@@ -0,0 +1,71 @@
1
+ import ctypes
2
+ from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
3
+ import numpy as np
4
+ import os
5
+
6
+ libc = cdll.LoadLibrary("./libembdinput.so")
7
+ libc.sampling.restype=c_char_p
8
+ libc.create_mymodel.restype=c_void_p
9
+ libc.eval_string.argtypes=[c_void_p, c_char_p]
10
+ libc.sampling.argtypes=[c_void_p]
11
+ libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
12
+
13
+
14
+ class MyModel:
15
+ def __init__(self, args):
16
+ argc = len(args)
17
+ c_str = [c_char_p(i.encode()) for i in args]
18
+ args_c = (c_char_p * argc)(*c_str)
19
+ self.model = c_void_p(libc.create_mymodel(argc, args_c))
20
+ self.max_tgt_len = 512
21
+ self.print_string_eval = True
22
+
23
+ def __del__(self):
24
+ libc.free_mymodel(self.model)
25
+
26
+ def eval_float(self, x):
27
+ libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
28
+
29
+ def eval_string(self, x):
30
+ libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
31
+ if self.print_string_eval:
32
+ print(x)
33
+
34
+ def eval_token(self, x):
35
+ libc.eval_id(self.model, x)
36
+
37
+ def sampling(self):
38
+ s = libc.sampling(self.model)
39
+ return s
40
+
41
+ def stream_generate(self, end="</s>"):
42
+ ret = b""
43
+ end = end.encode()
44
+ for _ in range(self.max_tgt_len):
45
+ tmp = self.sampling()
46
+ ret += tmp
47
+ yield tmp
48
+ if ret.endswith(end):
49
+ break
50
+
51
+ def generate_with_print(self, end="</s>"):
52
+ ret = b""
53
+ for i in self.stream_generate(end=end):
54
+ ret += i
55
+ print(i.decode(errors="replace"), end="", flush=True)
56
+ print("")
57
+ return ret.decode(errors="replace")
58
+
59
+
60
+ def generate(self, end="</s>"):
61
+ text = b"".join(self.stream_generate(end=end))
62
+ return text.decode(errors="replace")
63
+
64
+ if __name__ == "__main__":
65
+ model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
66
+ model.eval_string("""user: what is the color of the flag of UN?""")
67
+ x = np.random.random((5120,10))# , dtype=np.float32)
68
+ model.eval_float(x)
69
+ model.eval_string("""assistant:""")
70
+ for i in model.generate():
71
+ print(i.decode(errors="replace"), end="", flush=True)
examples/embd-input/llava.py ADDED
@@ -0,0 +1,70 @@
1
+ import sys
2
+ import os
3
+ sys.path.insert(0, os.path.dirname(__file__))
4
+ from embd_input import MyModel
5
+ import numpy as np
6
+ from torch import nn
7
+ import torch
8
+ from transformers import CLIPVisionModel, CLIPImageProcessor
9
+ from PIL import Image
10
+
11
+ # model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
12
+ vision_tower = "openai/clip-vit-large-patch14"
13
+ select_hidden_state_layer = -2
14
+ # (vision_config.image_size // vision_config.patch_size) ** 2
15
+ image_token_len = (224//14)**2
16
+
17
+ class Llava:
18
+ def __init__(self, args):
19
+ self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
20
+ self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
21
+ self.mm_projector = nn.Linear(1024, 5120)
22
+ self.model = MyModel(["main", *args])
23
+
24
+ def load_projection(self, path):
25
+ state = torch.load(path)
26
+ self.mm_projector.load_state_dict({
27
+ "weight": state["model.mm_projector.weight"],
28
+ "bias": state["model.mm_projector.bias"]})
29
+
30
+ def chat(self, question):
31
+ self.model.eval_string("user: ")
32
+ self.model.eval_string(question)
33
+ self.model.eval_string("\nassistant: ")
34
+ return self.model.generate_with_print()
35
+
36
+ def chat_with_image(self, image, question):
37
+ with torch.no_grad():
38
+ embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
39
+ image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
40
+ select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
41
+ image_feature = select_hidden_state[:, 1:]
42
+ embd_image = self.mm_projector(image_feature)
43
+ embd_image = embd_image.cpu().numpy()[0]
44
+ self.model.eval_string("user: ")
45
+ self.model.eval_token(32003-2) # im_start
46
+ self.model.eval_float(embd_image.T)
47
+ for i in range(image_token_len-embd_image.shape[0]):
48
+ self.model.eval_token(32003-3) # im_patch
49
+ self.model.eval_token(32003-1) # im_end
50
+ self.model.eval_string(question)
51
+ self.model.eval_string("\nassistant: ")
52
+ return self.model.generate_with_print()
53
+
54
+
55
+ if __name__=="__main__":
56
+ # model from liuhaotian/LLaVA-13b-delta-v1-1
57
+ a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
58
+ # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
59
+ # Also here can use pytorch_model-00003-of-00003.bin directly.
60
+ a.load_projection(os.path.join(
61
+ os.path.dirname(__file__) ,
62
+ "llava_projection.pth"))
63
+ response = a.chat_with_image(
64
+ Image.open("./media/llama1-logo.png").convert('RGB'),
65
+ "what is the text in the picture?")
66
+ response
67
+ a.chat("what is the color of it?")
68
+
69
+
70
+
examples/embd-input/minigpt4.py ADDED
@@ -0,0 +1,128 @@
1
+ import sys
2
+ import os
3
+ sys.path.insert(0, os.path.dirname(__file__))
4
+ from embd_input import MyModel
5
+ import numpy as np
6
+ from torch import nn
7
+ import torch
8
+ from PIL import Image
9
+
10
+ minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
11
+ sys.path.insert(0, minigpt4_path)
12
+ from minigpt4.models.blip2 import Blip2Base
13
+ from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
14
+
15
+
16
+ class MiniGPT4(Blip2Base):
17
+ """
18
+ MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
19
+ """
20
+ def __init__(self,
21
+ args,
22
+ vit_model="eva_clip_g",
23
+ q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
24
+ img_size=224,
25
+ drop_path_rate=0,
26
+ use_grad_checkpoint=False,
27
+ vit_precision="fp32",
28
+ freeze_vit=True,
29
+ freeze_qformer=True,
30
+ num_query_token=32,
31
+ llama_model="",
32
+ prompt_path="",
33
+ prompt_template="",
34
+ max_txt_len=32,
35
+ end_sym='\n',
36
+ low_resource=False, # use 8 bit and put vit in cpu
37
+ device_8bit=0
38
+ ):
39
+ super().__init__()
40
+ self.img_size = img_size
41
+ self.low_resource = low_resource
42
+ self.preprocessor = Blip2ImageEvalProcessor(img_size)
43
+
44
+ print('Loading VIT')
45
+ self.visual_encoder, self.ln_vision = self.init_vision_encoder(
46
+ vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
47
+ )
48
+ print('Loading VIT Done')
49
+ print('Loading Q-Former')
50
+ self.Qformer, self.query_tokens = self.init_Qformer(
51
+ num_query_token, self.visual_encoder.num_features
52
+ )
53
+ self.Qformer.cls = None
54
+ self.Qformer.bert.embeddings.word_embeddings = None
55
+ self.Qformer.bert.embeddings.position_embeddings = None
56
+ for layer in self.Qformer.bert.encoder.layer:
57
+ layer.output = None
58
+ layer.intermediate = None
59
+ self.load_from_pretrained(url_or_filename=q_former_model)
60
+ print('Loading Q-Former Done')
61
+ self.llama_proj = nn.Linear(
62
+ self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
63
+ )
64
+ self.max_txt_len = max_txt_len
65
+ self.end_sym = end_sym
66
+ self.model = MyModel(["main", *args])
67
+ # system prompt
68
+ self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
69
+ "You will be able to see the image once I provide it to you. Please answer my questions."
70
+ "###")
71
+
72
+ def encode_img(self, image):
73
+ image = self.preprocessor(image)
74
+ image = image.unsqueeze(0)
75
+ device = image.device
76
+ if self.low_resource:
77
+ self.vit_to_cpu()
78
+ image = image.to("cpu")
79
+
80
+ with self.maybe_autocast():
81
+ image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
82
+ image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
83
+
84
+ query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
85
+ query_output = self.Qformer.bert(
86
+ query_embeds=query_tokens,
87
+ encoder_hidden_states=image_embeds,
88
+ encoder_attention_mask=image_atts,
89
+ return_dict=True,
90
+ )
91
+
92
+ inputs_llama = self.llama_proj(query_output.last_hidden_state)
93
+ # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
94
+ return inputs_llama
95
+
96
+ def load_projection(self, path):
97
+ state = torch.load(path)["model"]
98
+ self.llama_proj.load_state_dict({
99
+ "weight": state["llama_proj.weight"],
100
+ "bias": state["llama_proj.bias"]})
101
+
102
+ def chat(self, question):
103
+ self.model.eval_string("Human: ")
104
+ self.model.eval_string(question)
105
+ self.model.eval_string("\n### Assistant:")
106
+ return self.model.generate_with_print(end="###")
107
+
108
+ def chat_with_image(self, image, question):
109
+ with torch.no_grad():
110
+ embd_image = self.encode_img(image)
111
+ embd_image = embd_image.cpu().numpy()[0]
112
+ self.model.eval_string("Human: <Img>")
113
+ self.model.eval_float(embd_image.T)
114
+ self.model.eval_string("</Img> ")
115
+ self.model.eval_string(question)
116
+ self.model.eval_string("\n### Assistant:")
117
+ return self.model.generate_with_print(end="###")
118
+
119
+
120
+ if __name__=="__main__":
121
+ a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
122
+ a.load_projection(os.path.join(
123
+ os.path.dirname(__file__) ,
124
+ "pretrained_minigpt4.pth"))
125
+ response = a.chat_with_image(
126
+ Image.open("./media/llama1-logo.png").convert('RGB'),
127
+ "what is the text in the picture?")
128
+ a.chat("what is the color of it?")
examples/embd-input/panda_gpt.py ADDED
@@ -0,0 +1,98 @@
1
+ import sys
2
+ import os
3
+ sys.path.insert(0, os.path.dirname(__file__))
4
+ from embd_input import MyModel
5
+ import numpy as np
6
+ from torch import nn
7
+ import torch
8
+
9
+ # use PandaGPT path
10
+ panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
11
+ imagebind_ckpt_path = "./models/panda_gpt/"
12
+
13
+ sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
14
+ from ImageBind.models import imagebind_model
15
+ from ImageBind import data
16
+
17
+ ModalityType = imagebind_model.ModalityType
18
+ max_tgt_len = 400
19
+
20
+ class PandaGPT:
21
+ def __init__(self, args):
22
+ self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
23
+ self.visual_encoder.eval()
24
+ self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
25
+ self.max_tgt_len = max_tgt_len
26
+ self.model = MyModel(["main", *args])
27
+ self.generated_text = ""
28
+ self.device = "cpu"
29
+
30
+ def load_projection(self, path):
31
+ state = torch.load(path, map_location="cpu")
32
+ self.llama_proj.load_state_dict({
33
+ "weight": state["llama_proj.weight"],
34
+ "bias": state["llama_proj.bias"]})
35
+
36
+ def eval_inputs(self, inputs):
37
+ self.model.eval_string("<Img>")
38
+ embds = self.extract_multimoal_feature(inputs)
39
+ for i in embds:
40
+ self.model.eval_float(i.T)
41
+ self.model.eval_string("</Img> ")
42
+
43
+ def chat(self, question):
44
+ return self.chat_with_image(None, question)
45
+
46
+ def chat_with_image(self, inputs, question):
47
+ if self.generated_text == "":
48
+ self.model.eval_string("###")
49
+ self.model.eval_string(" Human: ")
50
+ if inputs:
51
+ self.eval_inputs(inputs)
52
+ self.model.eval_string(question)
53
+ self.model.eval_string("\n### Assistant:")
54
+ ret = self.model.generate_with_print(end="###")
55
+ self.generated_text += ret
56
+ return ret
57
+
58
+ def extract_multimoal_feature(self, inputs):
59
+ features = []
60
+ for key in ["image", "audio", "video", "thermal"]:
61
+ if key + "_paths" in inputs:
62
+ embeds = self.encode_data(key, inputs[key+"_paths"])
63
+ features.append(embeds)
64
+ return features
65
+
66
+ def encode_data(self, data_type, data_paths):
67
+
68
+ type_map = {
69
+ "image": ModalityType.VISION,
70
+ "audio": ModalityType.AUDIO,
71
+ "video": ModalityType.VISION,
72
+ "thermal": ModalityType.THERMAL,
73
+ }
74
+ load_map = {
75
+ "image": data.load_and_transform_vision_data,
76
+ "audio": data.load_and_transform_audio_data,
77
+ "video": data.load_and_transform_video_data,
78
+ "thermal": data.load_and_transform_thermal_data
79
+ }
80
+
81
+ load_function = load_map[data_type]
82
+ key = type_map[data_type]
83
+
84
+ inputs = {key: load_function(data_paths, self.device)}
85
+ with torch.no_grad():
86
+ embeddings = self.visual_encoder(inputs)
87
+ embeds = embeddings[key]
88
+ embeds = self.llama_proj(embeds).cpu().numpy()
89
+ return embeds
90
+
91
+
92
+ if __name__=="__main__":
93
+ a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
94
+ a.load_projection("./models/panda_gpt/adapter_model.bin")
95
+ a.chat_with_image(
96
+ {"image_paths": ["./media/llama1-logo.png"]},
97
+ "what is the text in the picture? 'llama' or 'lambda'?")
98
+ a.chat("what is the color of it?")
examples/embedding/embedding.cpp CHANGED
@@ -24,18 +24,18 @@ int main(int argc, char ** argv) {
24
 
25
  fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
26
 
27
- if (params.seed < 0) {
28
  params.seed = time(NULL);
29
  }
30
 
31
- fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
32
 
33
  std::mt19937 rng(params.seed);
34
  if (params.random_prompt) {
35
  params.prompt = gpt_random_prompt(rng);
36
  }
37
 
38
- llama_init_backend();
39
 
40
  llama_model * model;
41
  llama_context * ctx;
 
24
 
25
  fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
26
 
27
+ if (params.seed == LLAMA_DEFAULT_SEED) {
28
  params.seed = time(NULL);
29
  }
30
 
31
+ fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
32
 
33
  std::mt19937 rng(params.seed);
34
  if (params.random_prompt) {
35
  params.prompt = gpt_random_prompt(rng);
36
  }
37
 
38
+ llama_init_backend(params.numa);
39
 
40
  llama_model * model;
41
  llama_context * ctx;
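This hunk (and the matching ones in main.cpp, perplexity.cpp and server.cpp below) replaces the signed `seed < 0` check with the `LLAMA_DEFAULT_SEED` sentinel and passes the NUMA flag into `llama_init_backend`. A minimal sketch of the resulting startup pattern, assuming the `llama.h` from this commit (which declares `LLAMA_DEFAULT_SEED` and `llama_init_backend(bool)`); the `example_params` struct is a stand-in for `gpt_params`, used only for illustration:

```cpp
// Sketch of the updated startup pattern, assuming the llama.h from this commit
// (LLAMA_DEFAULT_SEED and llama_init_backend(bool numa)).
#include <cstdint>
#include <cstdio>
#include <ctime>
#include <random>

#include "llama.h"

struct example_params {                    // stand-in for gpt_params, illustration only
    uint32_t seed = LLAMA_DEFAULT_SEED;    // seed is now unsigned
    bool     numa = false;
};

int main() {
    example_params params;

    // Compare against the sentinel instead of "seed < 0", which no longer
    // makes sense for an unsigned seed.
    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }
    fprintf(stderr, "seed = %u\n", params.seed);   // %u rather than %d

    std::mt19937 rng(params.seed);                 // same RNG setup as the examples
    (void) rng;

    // Backend init now takes the NUMA flag explicitly.
    llama_init_backend(params.numa);
    return 0;
}
```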
examples/main/README.md CHANGED
@@ -242,7 +242,7 @@ Example usage: `--logit-bias 29905-inf`
242
 
243
  ### RNG Seed
244
 
245
- - `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
246
 
247
  The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
248
 
@@ -262,6 +262,10 @@ These options help improve the performance and memory usage of the LLaMA models.
262
 
263
  - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
264
 
 
 
 
 
265
  ### Memory Float 32
266
 
267
  - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
 
242
 
243
  ### RNG Seed
244
 
245
+ - `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
246
 
247
  The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
248
 
 
262
 
263
  - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
264
 
265
+ ### NUMA support
266
+
267
+ - `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
268
+
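A rough sketch of how such a flag reaches the backend, assuming the `llama_init_backend(bool numa)` signature used throughout this diff; the argument parsing below is simplified for illustration and is not the actual `common.cpp` code:

```cpp
// Sketch only: wiring a --numa flag into backend init, assuming the
// llama_init_backend(bool numa) signature introduced in this diff.
#include <cstring>

#include "llama.h"

int main(int argc, char ** argv) {
    bool use_numa = false;
    for (int i = 1; i < argc; ++i) {
        if (strcmp(argv[i], "--numa") == 0) {
            use_numa = true;   // pin threads per NUMA node, skip mmap prefetch/readahead
        }
    }

    // Called once, before the model is loaded, as in the examples in this diff.
    llama_init_backend(use_numa);

    // ... load the model and run as usual ...
    return 0;
}
```

As the note above says, the optimization mainly helps on a cold page cache; rerunning on an already-cached model will show little difference.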
269
  ### Memory Float 32
270
 
271
  - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
examples/main/main.cpp CHANGED
@@ -94,18 +94,18 @@ int main(int argc, char ** argv) {
94
 
95
  fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
96
 
97
- if (params.seed < 0) {
98
  params.seed = time(NULL);
99
  }
100
 
101
- fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
102
 
103
  std::mt19937 rng(params.seed);
104
  if (params.random_prompt) {
105
  params.prompt = gpt_random_prompt(rng);
106
  }
107
 
108
- llama_init_backend();
109
 
110
  llama_model * model;
111
  llama_context * ctx;
 
94
 
95
  fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
96
 
97
+ if (params.seed == LLAMA_DEFAULT_SEED) {
98
  params.seed = time(NULL);
99
  }
100
 
101
+ fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
102
 
103
  std::mt19937 rng(params.seed);
104
  if (params.random_prompt) {
105
  params.prompt = gpt_random_prompt(rng);
106
  }
107
 
108
+ llama_init_backend(params.numa);
109
 
110
  llama_model * model;
111
  llama_context * ctx;
examples/perplexity/perplexity.cpp CHANGED
@@ -136,18 +136,18 @@ int main(int argc, char ** argv) {
136
 
137
  fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
138
 
139
- if (params.seed < 0) {
140
  params.seed = time(NULL);
141
  }
142
 
143
- fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
144
 
145
  std::mt19937 rng(params.seed);
146
  if (params.random_prompt) {
147
  params.prompt = gpt_random_prompt(rng);
148
  }
149
 
150
- llama_init_backend();
151
 
152
  llama_model * model;
153
  llama_context * ctx;
 
136
 
137
  fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
138
 
139
+ if (params.seed == LLAMA_DEFAULT_SEED) {
140
  params.seed = time(NULL);
141
  }
142
 
143
+ fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
144
 
145
  std::mt19937 rng(params.seed);
146
  if (params.random_prompt) {
147
  params.prompt = gpt_random_prompt(rng);
148
  }
149
 
150
+ llama_init_backend(params.numa);
151
 
152
  llama_model * model;
153
  llama_context * ctx;
examples/quantize/quantize.cpp CHANGED
@@ -178,7 +178,7 @@ int main(int argc, char ** argv) {
178
  usage(argv[0]);
179
  }
180
 
181
- llama_init_backend();
182
 
183
  // parse command line arguments
184
  const std::string fname_inp = argv[arg_idx];
 
178
  usage(argv[0]);
179
  }
180
 
181
+ llama_init_backend(false);
182
 
183
  // parse command line arguments
184
  const std::string fname_inp = argv[arg_idx];
examples/server/README.md CHANGED
@@ -152,7 +152,7 @@ node .
152
 
153
  `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
154
 
155
- `seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
156
 
157
  `ignore_eos`: Ignore end of stream token and continue generating (default: false).
158
 
 
152
 
153
  `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
154
 
155
+ `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
156
 
157
  `ignore_eos`: Ignore end of stream token and continue generating (default: false).
158
 
examples/server/server.cpp CHANGED
@@ -325,10 +325,10 @@ struct llama_server_context {
325
  id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
326
  } else {
327
  // Temperature sampling
 
328
  llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
329
  llama_sample_typical(ctx, &candidates_p, typical_p, 1);
330
  llama_sample_top_p(ctx, &candidates_p, top_p, 1);
331
- llama_sample_top_k(ctx, &candidates_p, top_k, 1);
332
  llama_sample_temperature(ctx, &candidates_p, temp);
333
  id = llama_sample_token(ctx, &candidates_p);
334
  }
@@ -789,7 +789,7 @@ int main(int argc, char ** argv) {
789
  params.model_alias = params.model;
790
  }
791
 
792
- llama_init_backend();
793
 
794
  LOG_INFO("build info", {
795
  { "build", BUILD_NUMBER },
 
325
  id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
326
  } else {
327
  // Temperature sampling
328
+ llama_sample_top_k(ctx, &candidates_p, top_k, 1);
329
  llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
330
  llama_sample_typical(ctx, &candidates_p, typical_p, 1);
331
  llama_sample_top_p(ctx, &candidates_p, top_p, 1);
 
332
  llama_sample_temperature(ctx, &candidates_p, temp);
333
  id = llama_sample_token(ctx, &candidates_p);
334
  }
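For reference, the full non-mirostat chain after this reordering, with top-k applied ahead of the other truncation samplers and temperature applied last. This is a condensed sketch of the block above, not a drop-in replacement; it assumes `ctx` and `candidates_p` are prepared exactly as in `llama_server_context`:

```cpp
// Condensed sketch of the corrected non-mirostat sampling order: top-k now runs
// before tail-free, locally-typical and top-p truncation, then temperature.
#include "llama.h"

static llama_token sample_next_token(
        llama_context * ctx, llama_token_data_array & candidates_p,
        int top_k, float tfs_z, float typical_p, float top_p, float temp) {
    llama_sample_top_k      (ctx, &candidates_p, top_k,     1);
    llama_sample_tail_free  (ctx, &candidates_p, tfs_z,     1);
    llama_sample_typical    (ctx, &candidates_p, typical_p, 1);
    llama_sample_top_p      (ctx, &candidates_p, top_p,     1);
    llama_sample_temperature(ctx, &candidates_p, temp);
    return llama_sample_token(ctx, &candidates_p);
}
```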
 
789
  params.model_alias = params.model;
790
  }
791
 
792
+ llama_init_backend(params.numa);
793
 
794
  LOG_INFO("build info", {
795
  { "build", BUILD_NUMBER },
examples/simple/simple.cpp CHANGED
@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
66
  // Init LLM :
67
  //---------------------------------
68
 
69
- llama_init_backend();
70
 
71
  llama_model * model;
72
  llama_context * ctx;
 
66
  // Init LLM :
67
  //---------------------------------
68
 
69
+ llama_init_backend(params.numa);
70
 
71
  llama_model * model;
72
  llama_context * ctx;
examples/train-text-from-scratch/train-text-from-scratch.cpp CHANGED
@@ -294,20 +294,9 @@ void init_model(struct my_llama_model * model) {
294
 
295
  ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
296
 
297
- // 'layers.10.feed_forward.w1.weight' has length of 32.
298
- // ggml_tensor->name only has 32 characters, but we need one more for the '\0' terminator.
299
- // ggml_set_name will set the last character to '\0', so we can only store 'layers.10.feed_forward.w1.weigh'.
300
- // when saving llama compatible model the tensors names will miss a character.
301
- // ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str());
302
- // ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str());
303
- // ggml_set_name(layer.w3, (layers_i + ".feed_forward.w3.weight").c_str());
304
-
305
- strncpy(layer.w1->name, (layers_i + ".feed_forward.w1.weight").c_str(), sizeof(layer.w1->name));
306
- strncpy(layer.w2->name, (layers_i + ".feed_forward.w2.weight").c_str(), sizeof(layer.w2->name));
307
- strncpy(layer.w3->name, (layers_i + ".feed_forward.w3.weight").c_str(), sizeof(layer.w3->name));
308
- layer.w1->padding[0] = 0;
309
- layer.w2->padding[0] = 0;
310
- layer.w3->padding[0] = 0;
311
  }
312
  }
313
 
@@ -454,8 +443,8 @@ struct ggml_tensor * forward(
454
  // wk shape [n_embd, n_embd, 1, 1]
455
  // Qcur shape [n_embd/n_head, n_head, N, 1]
456
  // Kcur shape [n_embd/n_head, n_head, N, 1]
457
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
458
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
459
 
460
  // store key and value to memory
461
  {
@@ -711,8 +700,8 @@ struct ggml_tensor * forward_batch(
711
  // wk shape [n_embd, n_embd, 1, 1]
712
  // Qcur shape [n_embd/n_head, n_head, N, n_batch]
713
  // Kcur shape [n_embd/n_head, n_head, N, n_batch]
714
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
715
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
716
  assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
717
  assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
718
 
@@ -996,8 +985,8 @@ struct ggml_tensor * forward_batch_wo_cache(
996
  // wk shape [n_embd, n_embd, 1, 1]
997
  // Qcur shape [n_embd/n_head, n_head, N, n_batch]
998
  // Kcur shape [n_embd/n_head, n_head, N, n_batch]
999
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
1000
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
1001
  assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
1002
  assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
1003
 
@@ -1218,8 +1207,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
1218
  // compute Q and K and RoPE them
1219
  // wq shape [n_embd, n_embd, 1, 1]
1220
  // wk shape [n_embd, n_embd, 1, 1]
1221
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
1222
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
1223
  assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
1224
  assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
1225
 
@@ -1618,10 +1607,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
1618
  use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch);
1619
  use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch);
1620
  use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
1621
- use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
1622
  use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch);
1623
  use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
1624
- use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
1625
  use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd);
1626
  use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
1627
  use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
@@ -2368,7 +2357,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
2368
  file->write_u32(0);
2369
  file->write_u32(0);
2370
  file->write_u32(GGML_TYPE_F32);
2371
- file->seek(0-file->tell() & 31, SEEK_CUR);
2372
  return;
2373
  }
2374
  const char * name = ggml_get_name(tensor);
@@ -2383,7 +2372,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
2383
  file->write_u32(tensor->type);
2384
  file->write_raw(ne, sizeof(ne[0]) * nd);
2385
  file->write_raw(name, name_len);
2386
- file->seek(0-file->tell() & 31, SEEK_CUR);
2387
  file->write_raw(tensor->data, ggml_nbytes(tensor));
2388
  }
2389
 
@@ -2404,7 +2393,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
2404
  std::string name = file->read_string(name_len);
2405
  GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);
2406
 
2407
- file->seek(0-file->tell() & 31, SEEK_CUR);
2408
  file->read_raw(tensor->data, ggml_nbytes(tensor));
2409
  }
2410
 
@@ -2779,7 +2768,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
2779
  fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in);
2780
  fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
2781
  fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out);
2782
- fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
2783
  fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx);
2784
  fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd);
2785
  fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult);
@@ -3045,10 +3034,10 @@ int main(int argc, char ** argv) {
3045
  return 1;
3046
  }
3047
 
3048
- if (params.seed < 0) {
3049
  params.seed = time(NULL);
3050
  }
3051
- printf("%s: seed: %d\n", __func__, params.seed);
3052
  srand(params.seed);
3053
 
3054
  struct llama_context_params llama_params = llama_context_default_params();
 
294
 
295
  ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
296
 
297
+ ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
298
+ ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
299
+ ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
 
 
 
 
 
 
 
 
 
 
 
300
  }
301
  }
302
 
 
443
  // wk shape [n_embd, n_embd, 1, 1]
444
  // Qcur shape [n_embd/n_head, n_head, N, 1]
445
  // Kcur shape [n_embd/n_head, n_head, N, 1]
446
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
447
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
448
 
449
  // store key and value to memory
450
  {
 
700
  // wk shape [n_embd, n_embd, 1, 1]
701
  // Qcur shape [n_embd/n_head, n_head, N, n_batch]
702
  // Kcur shape [n_embd/n_head, n_head, N, n_batch]
703
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
704
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
705
  assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
706
  assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
707
 
 
985
  // wk shape [n_embd, n_embd, 1, 1]
986
  // Qcur shape [n_embd/n_head, n_head, N, n_batch]
987
  // Kcur shape [n_embd/n_head, n_head, N, n_batch]
988
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
989
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
990
  assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
991
  assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
992
 
 
1207
  // compute Q and K and RoPE them
1208
  // wq shape [n_embd, n_embd, 1, 1]
1209
  // wk shape [n_embd, n_embd, 1, 1]
1210
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
1211
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
1212
  assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
1213
  assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
1214
 
 
1607
  use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch);
1608
  use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch);
1609
  use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
1610
+ use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
1611
  use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch);
1612
  use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
1613
+ use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
1614
  use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd);
1615
  use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
1616
  use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
 
2357
  file->write_u32(0);
2358
  file->write_u32(0);
2359
  file->write_u32(GGML_TYPE_F32);
2360
+ file->seek((0-file->tell()) & 31, SEEK_CUR);
2361
  return;
2362
  }
2363
  const char * name = ggml_get_name(tensor);
 
2372
  file->write_u32(tensor->type);
2373
  file->write_raw(ne, sizeof(ne[0]) * nd);
2374
  file->write_raw(name, name_len);
2375
+ file->seek((0-file->tell()) & 31, SEEK_CUR);
2376
  file->write_raw(tensor->data, ggml_nbytes(tensor));
2377
  }
2378
 
 
2393
  std::string name = file->read_string(name_len);
2394
  GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);
2395
 
2396
+ file->seek((0-file->tell()) & 31, SEEK_CUR);
2397
  file->read_raw(tensor->data, ggml_nbytes(tensor));
2398
  }
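The `seek` calls above pad the file offset up to the next 32-byte boundary; the added parentheses only spell out what operator precedence already does, since binary `-` binds tighter than `&`. A small self-contained illustration of that arithmetic (`align_pad` is a made-up name for this sketch, not a helper in the file):

```cpp
// Illustration of the 32-byte alignment padding used by write_tensor/read_tensor:
// (0 - offset) & 31 is the number of bytes needed to reach the next multiple of 32
// (0 when the offset is already aligned).
#include <cassert>
#include <cstddef>
#include <cstdio>

static size_t align_pad(size_t offset) {
    return (0 - offset) & 31;          // same value as (32 - offset % 32) % 32
}

int main() {
    const size_t offsets[] = {0, 1, 31, 32, 33, 100};
    for (size_t offset : offsets) {
        printf("offset %3zu -> pad %2zu -> aligned to %3zu\n",
               offset, align_pad(offset), offset + align_pad(offset));
    }
    assert(align_pad(100) == 28);      // 100 + 28 == 128, a multiple of 32
    return 0;
}
```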
2399
 
 
2768
  fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in);
2769
  fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
2770
  fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out);
2771
+ fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n");
2772
  fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx);
2773
  fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd);
2774
  fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult);
 
3034
  return 1;
3035
  }
3036
 
3037
+ if (params.seed == LLAMA_DEFAULT_SEED) {
3038
  params.seed = time(NULL);
3039
  }
3040
+ printf("%s: seed: %u\n", __func__, params.seed);
3041
  srand(params.seed);
3042
 
3043
  struct llama_context_params llama_params = llama_context_default_params();
expose.h CHANGED
@@ -8,6 +8,7 @@ struct load_model_inputs
8
  const int max_context_length;
9
  const int batch_size;
10
  const bool f16_kv;
 
11
  const char * executable_path;
12
  const char * model_filename;
13
  const char * lora_filename;
 
8
  const int max_context_length;
9
  const int batch_size;
10
  const bool f16_kv;
11
+ const bool low_vram;
12
  const char * executable_path;
13
  const char * model_filename;
14
  const char * lora_filename;
ggml-cuda.cu CHANGED
@@ -117,7 +117,13 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
117
 
118
  //================================= k-quants
119
 
 
 
 
 
120
  #define QK_K 256
 
 
121
 
122
  typedef struct {
123
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
@@ -128,13 +134,25 @@ typedef struct {
128
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
129
 
130
  typedef struct {
131
- uint8_t hmask[QK_K/8];
132
- uint8_t qs[QK_K/4]; // nibbles / quants
133
- uint8_t scales[3*QK_K/64];
134
- half d;
 
 
 
 
135
  } block_q3_K;
136
- static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
137
 
 
 
 
 
 
 
 
 
138
  typedef struct {
139
  half d; // super-block scale for quantized scales
140
  half dmin; // super-block scale for quantized mins
@@ -142,15 +160,26 @@ typedef struct {
142
 uint8_t qs[QK_K/2]; // 4-bit quants
143
  } block_q4_K;
144
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 
145
 
 
 
 
 
 
 
 
 
 
146
  typedef struct {
147
- half d; // super-block scale for quantized scales
148
- half dmin; // super-block scale for quantized mins
149
- uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
150
  uint8_t qh[QK_K/8]; // quants, high bit
151
  uint8_t qs[QK_K/2]; // quants, low 4 bits
152
  } block_q5_K;
153
- static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 
154
 
155
  typedef struct {
156
  uint8_t ql[QK_K/2]; // quants, lower 4 bits
@@ -194,6 +223,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
194
  dst[i] = x[i] + y[i];
195
  }
196
 
 
 
 
 
 
 
 
 
 
197
  static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
198
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
199
 
@@ -349,13 +387,14 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
349
  static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
350
 
351
  const int i = blockIdx.x;
 
 
352
  const int tid = threadIdx.x;
 
353
  const int n = tid/32;
354
  const int l = tid - 32*n;
355
  const int is = 8*n + l/16;
356
 
357
- const block_q2_K * x = (const block_q2_K *) vx;
358
-
359
  const uint8_t q = x[i].qs[32*n + l];
360
  float * y = yy + i*QK_K + 128*n;
361
 
@@ -365,21 +404,32 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
365
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
366
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
367
  y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
 
 
 
 
 
 
 
 
 
 
368
 
369
  }
370
 
371
  static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
372
 
373
- int r = threadIdx.x/4;
374
- int i = blockIdx.x;
375
- int tid = r/2;
376
- int is0 = r%2;
377
- int l0 = 16*is0 + 4*(threadIdx.x%4);
378
- int n = tid / 4;
379
- int j = tid - 4*n;
380
-
381
  const block_q3_K * x = (const block_q3_K *) vx;
382
 
 
 
 
 
 
 
 
 
383
  uint8_t m = 1 << (4*n + j);
384
  int is = 8*n + 2*j + is0;
385
  int shift = 2*j;
@@ -396,9 +446,31 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
396
  const uint8_t * hm = x[i].hmask;
397
 
398
  for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
 
400
  }
401
 
 
402
  static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
403
  if (j < 4) {
404
  d = q[j] & 63; m = q[j + 4] & 63;
@@ -407,19 +479,14 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
407
  m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
408
  }
409
  }
 
410
 
411
  static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
412
  const block_q4_K * x = (const block_q4_K *) vx;
413
 
414
  const int i = blockIdx.x;
415
 
416
- //// assume 64 threads - this is very slightly better than the one below
417
- //const int tid = threadIdx.x;
418
- //const int il = tid/16;
419
- //const int ir = tid%16;
420
- //const int is = 2*il;
421
- //const int n = 2;
422
-
423
  // assume 32 threads
424
  const int tid = threadIdx.x;
425
  const int il = tid/8;
@@ -443,6 +510,15 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
443
  y[l + 0] = d1 * (q[l] & 0xF) - m1;
444
  y[l +32] = d2 * (q[l] >> 4) - m2;
445
  }
 
 
 
 
 
 
 
 
 
446
  }
447
 
448
  static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
@@ -450,6 +526,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
450
 
451
  const int i = blockIdx.x;
452
 
 
453
  // assume 64 threads - this is very slightly better than the one below
454
  const int tid = threadIdx.x;
455
  const int il = tid/16; // il is in 0...3
@@ -476,12 +553,25 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
476
  hm <<= 1;
477
  y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
478
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
 
 
 
 
 
 
 
 
 
 
 
 
479
  }
480
 
481
  static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
482
  const block_q6_K * x = (const block_q6_K *) vx;
483
 
484
  const int i = blockIdx.x;
 
485
 
486
  // assume 64 threads - this is very slightly better than the one below
487
  const int tid = threadIdx.x;
@@ -501,6 +591,24 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
501
  y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
502
  y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
503
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
  }
505
 
506
  static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
@@ -515,6 +623,9 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
515
 
516
  const block_q2_K * x = (const block_q2_K *)vx + ib0;
517
 
 
 
 
518
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
519
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
520
 
@@ -528,8 +639,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
528
  const int s_offset = 8*im;
529
  const int y_offset = 128*im + l0;
530
 
531
- float tmp = 0; // partial sum for thread in warp
532
-
533
  uint32_t aux[4];
534
  const uint8_t * d = (const uint8_t *)aux;
535
  const uint8_t * m = (const uint8_t *)(aux + 2);
@@ -565,6 +674,39 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
565
  tmp += dall * sum1 - dmin * sum2;
566
 
567
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568
 
569
  // sum up partial sums and write back result
570
  __syncthreads();
@@ -573,16 +715,13 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
573
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
574
  }
575
 
576
- if (tid == 0) {
577
  dst[row] = tmp;
578
  }
579
  }
580
 
581
  static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
582
 
583
- const uint16_t kmask1 = 0x0303;
584
- const uint16_t kmask2 = 0x0f0f;
585
-
586
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
587
  if (row > nrows) return;
588
 
@@ -591,6 +730,13 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
591
 
592
  const block_q3_K * x = (const block_q3_K *)vx + ib0;
593
 
 
 
 
 
 
 
 
594
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
595
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
596
 
@@ -610,8 +756,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
610
 
611
  const uint16_t s_shift = 4*im;
612
 
613
- float tmp = 0; // partial sum for thread in warp
614
-
615
  for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
616
 
617
  const float * y = yy + i * QK_K + y_offset;
@@ -640,6 +784,34 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
640
  tmp += d * sum;
641
 
642
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643
 
644
  // sum up partial sums and write back result
645
  __syncthreads();
@@ -648,22 +820,25 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
648
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
649
  }
650
 
651
- if (tid == 0) {
652
  dst[row] = tmp;
653
  }
654
  }
655
 
656
  static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
657
 
658
- const uint16_t kmask1 = 0x3f3f;
659
- const uint16_t kmask2 = 0x0f0f;
660
- const uint16_t kmask3 = 0xc0c0;
661
-
662
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
663
  if (row > nrows) return;
664
  const int num_blocks_per_row = ncols / QK_K;
665
  const int ib0 = row*num_blocks_per_row;
666
 
 
 
 
 
 
 
 
667
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
668
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
669
 
@@ -683,8 +858,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
683
  uint16_t aux[4];
684
  const uint8_t * sc = (const uint8_t *)aux;
685
 
686
- const block_q4_K * x = (const block_q4_K *)vx + ib0;
687
-
688
  float tmp = 0; // partial sum for thread in warp
689
 
690
  for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
@@ -713,6 +886,36 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
713
  tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
714
 
715
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
716
 
717
  // sum up partial sums and write back result
718
  __syncthreads();
@@ -728,15 +931,19 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
728
 
729
  static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
730
 
731
- const uint16_t kmask1 = 0x3f3f;
732
- const uint16_t kmask2 = 0x0f0f;
733
- const uint16_t kmask3 = 0xc0c0;
734
-
735
- //const int row = blockIdx.x*blockDim.y + threadIdx.y;
736
  const int row = blockIdx.x;
737
  const int num_blocks_per_row = ncols / QK_K;
738
  const int ib0 = row*num_blocks_per_row;
739
 
 
 
 
 
 
 
 
 
 
740
  const int tid = threadIdx.x/2; // 0...15
741
  const int ix = threadIdx.x%2;
742
 
@@ -757,10 +964,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
757
  uint16_t aux[4];
758
  const uint8_t * sc = (const uint8_t *)aux;
759
 
760
- const block_q5_K * x = (const block_q5_K *)vx + ib0;
761
-
762
- float tmp = 0; // partial sum for thread in warp
763
-
764
  for (int i = ix; i < num_blocks_per_row; i += 2) {
765
 
766
  const uint8_t * ql1 = x[i].qs + q_offset;
@@ -793,8 +996,31 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
793
  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
794
  }
795
  tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
 
796
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
797
  }
 
798
 
799
  // sum up partial sums and write back result
800
  __syncthreads();
@@ -803,7 +1029,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
803
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
804
  }
805
 
806
- if (tid == 0) {
807
  dst[row] = tmp;
808
  }
809
  }
@@ -820,6 +1046,8 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
820
 
821
  const block_q6_K * x = (const block_q6_K *)vx + ib0;
822
 
 
 
823
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
824
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
825
 
@@ -874,6 +1102,37 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
874
 
875
  }
876
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
877
  // sum up partial sums and write back result
878
  __syncthreads();
879
  #pragma unroll
@@ -985,7 +1244,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
985
  }
986
 
987
  static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
988
- const half * x = (half *) vx;
989
 
990
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
991
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1033,9 +1292,9 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
1033
 
1034
  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1035
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
1036
- const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
1037
 
1038
- const half * x = (half *) vx;
1039
 
1040
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
1041
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
@@ -1078,14 +1337,14 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1078
  }
1079
 
1080
  static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
1081
- const float * xi = (float *) cxi;
1082
  float * dsti = (float *) cdsti;
1083
 
1084
  *dsti = *xi;
1085
  }
1086
 
1087
  static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
1088
- const float * xi = (float *) cxi;
1089
  half * dsti = (half *) cdsti;
1090
 
1091
  *dsti = __float2half(*xi);
@@ -1209,6 +1468,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
1209
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
1210
  }
1211
 
 
 
 
 
 
1212
  static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
1213
  const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
1214
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -1252,12 +1516,20 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu
1252
 
1253
  static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1254
  const int nb = k / QK_K;
 
1255
  dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
 
 
 
1256
  }
1257
 
1258
  static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1259
  const int nb = k / QK_K;
 
1260
  dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
 
 
 
1261
  }
1262
 
1263
  static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -1267,12 +1539,20 @@ static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cu
1267
 
1268
  static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1269
  const int nb = k / QK_K;
 
1270
  dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
 
 
 
1271
  }
1272
 
1273
  static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1274
  const int nb = k / QK_K;
 
1275
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
 
 
 
1276
  }
1277
 
1278
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -1418,7 +1698,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
1418
  const dim3 block_nums(1, nrows_x, nchannels_x);
1419
  const dim3 block_dims(WARP_SIZE, 1, 1);
1420
  mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
1421
- (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x);
1422
  }
1423
 
1424
  static void ggml_cpy_f32_f32_cuda(
@@ -1497,15 +1777,40 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
1497
  int id;
1498
  CUDA_CHECK(cudaGetDevice(&id));
1499
 
 
 
 
 
 
1500
  for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
1501
  cuda_buffer& b = g_cuda_buffer_pool[id][i];
1502
- if (b.size >= size && b.ptr != nullptr) {
1503
- void * ptr = b.ptr;
1504
- *actual_size = b.size;
1505
- b.ptr = nullptr;
1506
- b.size = 0;
1507
- return ptr;
1508
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1509
  }
1510
  void * ptr;
1511
  CUDA_CHECK(cudaMalloc((void **) &ptr, size));
@@ -1675,7 +1980,7 @@ inline void ggml_cuda_op_add(
1675
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
1676
  cudaStream_t & cudaStream_main){
1677
 
1678
- GGML_ASSERT(src0_ddf_i != nullptr);
1679
  GGML_ASSERT(src1_ddf_i != nullptr);
1680
  GGML_ASSERT(dst_ddf_i != nullptr);
1681
 
@@ -1683,7 +1988,13 @@ inline void ggml_cuda_op_add(
1683
  const int64_t i01_diff = i01_high - i01_low;
1684
 
1685
  // compute
1686
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
 
 
 
 
 
 
1687
  CUDA_CHECK(cudaGetLastError());
1688
 
1689
  (void) src1;
@@ -1909,10 +2220,13 @@ inline void ggml_cuda_op_rope(
1909
  const int n_past = ((int32_t *) src1->data)[0];
1910
  const int n_dims = ((int32_t *) src1->data)[1];
1911
  const int mode = ((int32_t *) src1->data)[2];
 
1912
  GGML_ASSERT(mode == 0);
1913
 
1914
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
1915
- const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
 
 
1916
 
1917
  // compute
1918
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
@@ -2281,8 +2595,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2281
  }
2282
 
2283
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2284
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
2285
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
 
 
 
 
 
 
2286
  }
2287
 
2288
  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2535,7 +2855,7 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
2535
  delete extra;
2536
  }
2537
 
2538
- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
2539
  if (scratch && g_scratch_size == 0) {
2540
  return;
2541
  }
@@ -2544,22 +2864,24 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
2544
  if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
2545
  const ggml_op src0_op = tensor->src0->op;
2546
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
2547
- ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
2548
  }
2549
  }
2550
  if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
2551
- ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
2552
  }
2553
 
2554
  tensor->backend = GGML_BACKEND_GPU;
2555
  struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
 
2556
 
2557
  const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
2558
- tensor->op == GGML_OP_VIEW;
 
2559
  const size_t size = ggml_nbytes(tensor);
2560
 
2561
  CUDA_CHECK(cudaSetDevice(g_main_device));
2562
- if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
2563
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
2564
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
2565
  size_t offset = 0;
@@ -2598,11 +2920,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
2598
  }
2599
 
2600
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
2601
- ggml_cuda_assign_buffers_impl(tensor, true);
2602
  }
2603
 
2604
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
2605
- ggml_cuda_assign_buffers_impl(tensor, false);
 
 
 
 
2606
  }
2607
 
2608
  void ggml_cuda_set_main_device(int main_device) {
 
117
 
118
  //================================= k-quants
119
 
120
+ #ifdef GGML_QKK_64
121
+ #define QK_K 64
122
+ #define K_SCALE_SIZE 4
123
+ #else
124
  #define QK_K 256
125
+ #define K_SCALE_SIZE 12
126
+ #endif
127
 
128
  typedef struct {
129
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
 
134
  static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
135
 
136
  typedef struct {
137
+ uint8_t hmask[QK_K/8]; // quants - high bit
138
+ uint8_t qs[QK_K/4]; // quants - low 2 bits
139
+ #ifdef GGML_QKK_64
140
+ uint8_t scales[2]; // scales, quantized with 8 bits
141
+ #else
142
+ uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
143
+ #endif
144
+ half d; // super-block scale
145
  } block_q3_K;
146
+ //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
147
 
148
+ #ifdef GGML_QKK_64
149
+ typedef struct {
150
+ half d[2]; // super-block scales/mins
151
+ uint8_t scales[2]; // 4-bit block scales/mins
152
+ uint8_t qs[QK_K/2]; // 4-bit quants
153
+ } block_q4_K;
154
+ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
155
+ #else
156
  typedef struct {
157
  half d; // super-block scale for quantized scales
158
  half dmin; // super-block scale for quantized mins
 
160
 uint8_t qs[QK_K/2]; // 4-bit quants
161
  } block_q4_K;
162
  static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
163
+ #endif
164
 
165
+ #ifdef GGML_QKK_64
166
+ typedef struct {
167
+ half d; // super-block scale
168
+ int8_t scales[QK_K/16]; // block scales
169
+ uint8_t qh[QK_K/8]; // quants, high bit
170
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
171
+ } block_q5_K;
172
+ static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
173
+ #else
174
  typedef struct {
175
+ half d; // super-block scale for quantized scales
176
+ half dmin; // super-block scale for quantized mins
177
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
178
  uint8_t qh[QK_K/8]; // quants, high bit
179
  uint8_t qs[QK_K/2]; // quants, low 4 bits
180
  } block_q5_K;
181
+ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
182
+ #endif
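The `GGML_QKK_64` path halves `QK_K` and reshapes the scale storage, so the super-block sizes change accordingly. A small host-side check that redoes the arithmetic from the `static_assert`s above for both settings (plain C++, with the fp16 `half` counted as 2 bytes; nothing here is part of `ggml-cuda.cu`):

```cpp
// Host-side check of the super-block sizes implied by the layouts above. This
// just redoes the static_assert arithmetic for both QK_K settings; it does not
// include any ggml headers.
#include <cstdio>

constexpr int fp16 = 2;  // stand-in for sizeof(ggml_fp16_t)

constexpr int q4_K_size(int qk_k) {
    // QK_K == 64:  d[2] + scales[2] + qs[QK_K/2]
    // QK_K == 256: d + dmin + scales[3*QK_K/64] + qs[QK_K/2]
    return qk_k == 64 ? 2 * fp16 + 2 + qk_k / 2
                      : 2 * fp16 + 3 * qk_k / 64 + qk_k / 2;
}

constexpr int q5_K_size(int qk_k) {
    // QK_K == 64:  d + scales[QK_K/16] + qh[QK_K/8] + qs[QK_K/2]
    // QK_K == 256: d + dmin + scales[K_SCALE_SIZE=12] + qh[QK_K/8] + qs[QK_K/2]
    return qk_k == 64 ? fp16 + qk_k / 16 + qk_k / 8 + qk_k / 2
                      : 2 * fp16 + 12 + qk_k / 8 + qk_k / 2;
}

static_assert(q4_K_size(64)  ==  38, "q4_K, QK_K = 64");
static_assert(q4_K_size(256) == 144, "q4_K, QK_K = 256");
static_assert(q5_K_size(64)  ==  46, "q5_K, QK_K = 64");
static_assert(q5_K_size(256) == 176, "q5_K, QK_K = 256");

int main() {
    printf("q4_K super-block: %d bytes (QK_K=64) vs %d bytes (QK_K=256)\n",
           q4_K_size(64), q4_K_size(256));
    printf("q5_K super-block: %d bytes (QK_K=64) vs %d bytes (QK_K=256)\n",
           q5_K_size(64), q5_K_size(256));
    return 0;
}
```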
183
 
184
  typedef struct {
185
  uint8_t ql[QK_K/2]; // quants, lower 4 bits
 
223
  dst[i] = x[i] + y[i];
224
  }
225
 
226
+ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
227
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
228
+
229
+ if (i >= k) {
230
+ return;
231
+ }
232
+ dst[i] = __hadd(x[i], __float2half(y[i]));
233
+ }
234
+
235
  static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
236
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
237
 
 
387
  static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
388
 
389
  const int i = blockIdx.x;
390
+ const block_q2_K * x = (const block_q2_K *) vx;
391
+
392
  const int tid = threadIdx.x;
393
+ #if QK_K == 256
394
  const int n = tid/32;
395
  const int l = tid - 32*n;
396
  const int is = 8*n + l/16;
397
 
 
 
398
  const uint8_t q = x[i].qs[32*n + l];
399
  float * y = yy + i*QK_K + 128*n;
400
 
 
404
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
405
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
406
  y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
407
+ #else
408
+ const int is = tid/16; // 0 or 1
409
+ const int il = tid%16; // 0...15
410
+ const uint8_t q = x[i].qs[il] >> (2*is);
411
+ float * y = yy + i*QK_K + 16*is + il;
412
+ float dall = x[i].d;
413
+ float dmin = x[i].dmin;
414
+ y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
415
+ y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
416
+ #endif
417
 
418
  }
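For GGML_QKK_64 the q2_K path above maps 32 threads onto one 64-value super-block: byte l of qs carries the 2-bit quants for positions l, l+16, l+32 and l+48, and scales[0..3] pack a 4-bit scale (low nibble) and 4-bit min (high nibble) per 16-value sub-block. A sequential C++ reference of that branch; the struct name and the float stand-in for half are illustrative only.

```cpp
// Sequential reference for the QK_K == 64 branch of dequantize_block_q2_K.
#include <cstdint>

struct block_q2_K_64 {            // field order as in the diff; half -> float
    uint8_t scales[4];            // QK_K/16 packed 4-bit scale/min pairs
    uint8_t qs[16];               // QK_K/4 bytes, four 2-bit quants per byte
    float   d;                    // super-block scale (half in the real struct)
    float   dmin;                 // super-block min scale
};

void dequantize_q2_K_64(const block_q2_K_64 & x, float * y) {
    for (int j = 0; j < 4; ++j) {                 // 16-value sub-block index
        const float dl = x.d    * (x.scales[j] & 0xF);
        const float ml = x.dmin * (x.scales[j] >> 4);
        for (int l = 0; l < 16; ++l) {
            const int q = (x.qs[l] >> (2 * j)) & 3; // 2-bit lane j of byte l
            y[16 * j + l] = dl * q - ml;
        }
    }
}
```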
419
 
420
  static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
421
 
422
+ const int i = blockIdx.x;
 
 
 
 
 
 
 
423
  const block_q3_K * x = (const block_q3_K *) vx;
424
 
425
+ #if QK_K == 256
426
+ const int r = threadIdx.x/4;
427
+ const int tid = r/2;
428
+ const int is0 = r%2;
429
+ const int l0 = 16*is0 + 4*(threadIdx.x%4);
430
+ const int n = tid / 4;
431
+ const int j = tid - 4*n;
432
+
433
  uint8_t m = 1 << (4*n + j);
434
  int is = 8*n + 2*j + is0;
435
  int shift = 2*j;
 
446
  const uint8_t * hm = x[i].hmask;
447
 
448
  for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
449
+ #else
450
+ const int tid = threadIdx.x;
451
+ const int is = tid/16; // 0 or 1
452
+ const int il = tid%16; // 0...15
453
+ const int im = il/8; // 0...1
454
+ const int in = il%8; // 0...7
455
+
456
+ float * y = yy + i*QK_K + 16*is + il;
457
+
458
+ const uint8_t q = x[i].qs[il] >> (2*is);
459
+ const uint8_t h = x[i].hmask[in] >> (2*is + im);
460
+ const float d = (float)x[i].d;
461
+
462
+ if (is == 0) {
463
+ y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
464
+ y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
465
+ } else {
466
+ y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
467
+ y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
468
+ }
469
+ #endif
470
 
471
  }
472
 
473
+ #if QK_K == 256
474
  static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
475
  if (j < 4) {
476
  d = q[j] & 63; m = q[j + 4] & 63;
 
479
  m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
480
  }
481
  }
482
+ #endif
483
 
484
  static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
485
  const block_q4_K * x = (const block_q4_K *) vx;
486
 
487
  const int i = blockIdx.x;
488
 
489
+ #if QK_K == 256
 
 
 
 
 
 
490
  // assume 32 threads
491
  const int tid = threadIdx.x;
492
  const int il = tid/8;
 
510
  y[l + 0] = d1 * (q[l] & 0xF) - m1;
511
  y[l +32] = d2 * (q[l] >> 4) - m2;
512
  }
513
+ #else
514
+ const int tid = threadIdx.x;
515
+ const uint8_t * q = x[i].qs;
516
+ float * y = yy + i*QK_K;
517
+ const float d = (float)x[i].d[0];
518
+ const float m = (float)x[i].d[1];
519
+ y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
520
+ y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
521
+ #endif
522
  }
523
 
524
  static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
 
526
 
527
  const int i = blockIdx.x;
528
 
529
+ #if QK_K == 256
530
  // assume 64 threads - this is very slightly better than the one below
531
  const int tid = threadIdx.x;
532
  const int il = tid/16; // il is in 0...3
 
553
  hm <<= 1;
554
  y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
555
  y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
556
+ #else
557
+ const int tid = threadIdx.x;
558
+ const uint8_t q = x[i].qs[tid];
559
+ const int im = tid/8; // 0...3
560
+ const int in = tid%8; // 0...7
561
+ const int is = tid/16; // 0 or 1
562
+ const uint8_t h = x[i].qh[in] >> im;
563
+ const float d = x[i].d;
564
+ float * y = yy + i*QK_K + tid;
565
+ y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
566
+ y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
567
+ #endif
568
  }
569
 
570
  static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
571
  const block_q6_K * x = (const block_q6_K *) vx;
572
 
573
  const int i = blockIdx.x;
574
+ #if QK_K == 256
575
 
576
  // assume 64 threads - this is very slightly better than the one below
577
  const int tid = threadIdx.x;
 
591
  y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
592
  y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
593
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
594
+ #else
595
+
596
+ // assume 32 threads
597
+ const int tid = threadIdx.x;
598
+ const int ip = tid/16; // 0 or 1
599
+ const int il = tid - 16*ip; // 0...15
600
+
601
+ float * y = yy + i*QK_K + 16*ip + il;
602
+
603
+ const float d = x[i].d;
604
+
605
+ const uint8_t ql = x[i].ql[16*ip + il];
606
+ const uint8_t qh = x[i].qh[il] >> (2*ip);
607
+ const int8_t * sc = x[i].scales;
608
+
609
+ y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
610
+ y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
611
+ #endif
612
  }
613
 
614
  static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
 
623
 
624
  const block_q2_K * x = (const block_q2_K *)vx + ib0;
625
 
626
+ float tmp = 0; // partial sum for thread in warp
627
+
628
+ #if QK_K == 256
629
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
630
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
631
 
 
639
  const int s_offset = 8*im;
640
  const int y_offset = 128*im + l0;
641
 
 
 
642
  uint32_t aux[4];
643
  const uint8_t * d = (const uint8_t *)aux;
644
  const uint8_t * m = (const uint8_t *)(aux + 2);
 
674
  tmp += dall * sum1 - dmin * sum2;
675
 
676
  }
677
+ #else
678
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
679
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
680
+ const int offset = tid * K_QUANTS_PER_ITERATION;
681
+
682
+ uint32_t uaux[2];
683
+ const uint8_t * d = (const uint8_t *)uaux;
684
+
685
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
686
+
687
+ const float * y = yy + i * QK_K + offset;
688
+ const uint8_t * q = x[i].qs + offset;
689
+ const uint32_t * s = (const uint32_t *)x[i].scales;
690
+
691
+ uaux[0] = s[0] & 0x0f0f0f0f;
692
+ uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
693
+
694
+ const half2 * dh = (const half2 *)&x[i].d;
695
+
696
+ const float2 dall = __half22float2(dh[0]);
697
+
698
+ float sum1 = 0, sum2 = 0;
699
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
700
+ const uint8_t ql = q[l];
701
+ sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
702
+ + y[l+16] * d[1] * ((ql >> 2) & 3)
703
+ + y[l+32] * d[2] * ((ql >> 4) & 3)
704
+ + y[l+48] * d[3] * ((ql >> 6) & 3);
705
+ sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
706
+ }
707
+ tmp += dall.x * sum1 - dall.y * sum2;
708
+ }
709
+ #endif
710
 
711
  // sum up partial sums and write back result
712
  __syncthreads();
 
715
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
716
  }
717
 
718
+ if (threadIdx.x == 0) {
719
  dst[row] = tmp;
720
  }
721
  }
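Each dequantize_mul_mat_vec kernel in this hunk accumulates a per-thread partial sum tmp, reduces it across the warp with __shfl_xor_sync, and lets only threadIdx.x == 0 store dst[row]. A CPU model of that xor-butterfly reduction over 32 lanes:

```cpp
// After log2(32) xor-shuffle steps every lane holds the full warp sum, and
// only lane 0 (threadIdx.x == 0 in the kernel) needs to write it back.
#include <array>
#include <cstdio>

int main() {
    std::array<float, 32> lane_sum{};
    for (int i = 0; i < 32; ++i) lane_sum[i] = float(i);   // per-lane partial sums

    for (int mask = 16; mask > 0; mask >>= 1) {            // models __shfl_xor_sync
        std::array<float, 32> next = lane_sum;
        for (int lane = 0; lane < 32; ++lane) {
            next[lane] = lane_sum[lane] + lane_sum[lane ^ mask];
        }
        lane_sum = next;
    }
    std::printf("lane 0 sum = %g (expected %g)\n", lane_sum[0], 31.0f * 32 / 2);
}
```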
722
 
723
  static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
724
 
 
 
 
725
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
726
  if (row > nrows) return;
727
 
 
730
 
731
  const block_q3_K * x = (const block_q3_K *)vx + ib0;
732
 
733
+ float tmp = 0; // partial sum for thread in warp
734
+
735
+ #if QK_K == 256
736
+
737
+ const uint16_t kmask1 = 0x0303;
738
+ const uint16_t kmask2 = 0x0f0f;
739
+
740
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
741
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
742
 
 
756
 
757
  const uint16_t s_shift = 4*im;
758
 
 
 
759
  for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
760
 
761
  const float * y = yy + i * QK_K + y_offset;
 
784
  tmp += d * sum;
785
 
786
  }
787
+ #else
788
+
789
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
790
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
791
+ const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
792
+ const int in = offset/8; // 0 or 1
793
+ const int im = offset%8; // 0...7
794
+
795
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
796
+
797
+ const float * y = yy + i * QK_K + offset;
798
+ const uint8_t * q = x[i].qs + offset;
799
+ const uint8_t * s = x[i].scales;
800
+
801
+ const float dall = (float)x[i].d;
802
+
803
+ float sum = 0;
804
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
805
+ const uint8_t hl = x[i].hmask[im+l] >> in;
806
+ const uint8_t ql = q[l];
807
+ sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
808
+ + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
809
+ + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
810
+ + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
811
+ }
812
+ tmp += sum;
813
+ }
814
+ #endif
815
 
816
  // sum up partial sums and write back result
817
  __syncthreads();
 
820
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
821
  }
822
 
823
+ if (threadIdx.x == 0) {
824
  dst[row] = tmp;
825
  }
826
  }
827
 
828
  static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
829
 
 
 
 
 
830
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
831
  if (row > nrows) return;
832
  const int num_blocks_per_row = ncols / QK_K;
833
  const int ib0 = row*num_blocks_per_row;
834
 
835
+ const block_q4_K * x = (const block_q4_K *)vx + ib0;
836
+
837
+ #if QK_K == 256
838
+ const uint16_t kmask1 = 0x3f3f;
839
+ const uint16_t kmask2 = 0x0f0f;
840
+ const uint16_t kmask3 = 0xc0c0;
841
+
842
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
843
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
844
 
 
858
  uint16_t aux[4];
859
  const uint8_t * sc = (const uint8_t *)aux;
860
 
 
 
861
  float tmp = 0; // partial sum for thread in warp
862
 
863
  for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 
886
  tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
887
 
888
  }
889
+ #else
890
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
891
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
892
+
893
+ const int step = tid * K_QUANTS_PER_ITERATION;
894
+
895
+ uint16_t aux16[2];
896
+ const uint8_t * s = (const uint8_t *)aux16;
897
+
898
+ float tmp = 0;
899
+
900
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
901
+ const uint8_t * q = x[i].qs + step;
902
+ const float * y = yy + i*QK_K + step;
903
+ const uint16_t * a = (const uint16_t *)x[i].scales;
904
+ aux16[0] = a[0] & 0x0f0f;
905
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
906
+ const float d = (float)x[i].d[0];
907
+ const float m = (float)x[i].d[1];
908
+ float sum = 0.f;
909
+ for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
910
+ sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
911
+ + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
912
+ + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
913
+ + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
914
+ }
915
+ tmp += sum;
916
+ }
917
+
918
+ #endif
919
 
920
  // sum up partial sums and write back result
921
  __syncthreads();
 
931
 
932
  static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
933
 
 
 
 
 
 
934
  const int row = blockIdx.x;
935
  const int num_blocks_per_row = ncols / QK_K;
936
  const int ib0 = row*num_blocks_per_row;
937
 
938
+ const block_q5_K * x = (const block_q5_K *)vx + ib0;
939
+
940
+ float tmp = 0; // partial sum for thread in warp
941
+
942
+ #if QK_K == 256
943
+ const uint16_t kmask1 = 0x3f3f;
944
+ const uint16_t kmask2 = 0x0f0f;
945
+ const uint16_t kmask3 = 0xc0c0;
946
+
947
  const int tid = threadIdx.x/2; // 0...15
948
  const int ix = threadIdx.x%2;
949
 
 
964
  uint16_t aux[4];
965
  const uint8_t * sc = (const uint8_t *)aux;
966
 
 
 
 
 
967
  for (int i = ix; i < num_blocks_per_row; i += 2) {
968
 
969
  const uint8_t * ql1 = x[i].qs + q_offset;
 
996
  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
997
  }
998
  tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
999
+ }
1000
 
1001
+ #else
1002
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
1003
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
1004
+ const int step = tid * K_QUANTS_PER_ITERATION;
1005
+ const int im = step/8;
1006
+ const int in = step%8;
1007
+
1008
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
1009
+ const uint8_t * q = x[i].qs + step;
1010
+ const int8_t * s = x[i].scales;
1011
+ const float * y = yy + i*QK_K + step;
1012
+ const float d = x[i].d;
1013
+ float sum = 0.f;
1014
+ for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
1015
+ const uint8_t h = x[i].qh[in+j] >> im;
1016
+ sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
1017
+ + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
1018
+ + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
1019
+ + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
1020
+ }
1021
+ tmp += sum;
1022
  }
1023
+ #endif
1024
 
1025
  // sum up partial sums and write back result
1026
  __syncthreads();
 
1029
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
1030
  }
1031
 
1032
+ if (threadIdx.x == 0) {
1033
  dst[row] = tmp;
1034
  }
1035
  }
 
1046
 
1047
  const block_q6_K * x = (const block_q6_K *)vx + ib0;
1048
 
1049
+ #if QK_K == 256
1050
+
1051
  const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
1052
  const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
1053
 
 
1102
 
1103
  }
1104
 
1105
+ #else
1106
+
1107
+ const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7
1108
+ const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3
1109
+
1110
+ const int step = tid * K_QUANTS_PER_ITERATION;
1111
+
1112
+ float tmp = 0; // partial sum for thread in warp
1113
+
1114
+ for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
1115
+
1116
+ const float * y = yy + i * QK_K + step;
1117
+ const uint8_t * ql = x[i].ql + step;
1118
+ const uint8_t * qh = x[i].qh + step;
1119
+ const int8_t * s = x[i].scales;
1120
+
1121
+ const float d = x[i+0].d;
1122
+
1123
+ float sum = 0;
1124
+ for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
1125
+ sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
1126
+ + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
1127
+ + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
1128
+ + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
1129
+ }
1130
+ tmp += sum;
1131
+
1132
+ }
1133
+
1134
+ #endif
1135
+
1136
  // sum up partial sums and write back result
1137
  __syncthreads();
1138
  #pragma unroll
 
1244
  }
1245
 
1246
  static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
1247
+ const half * x = (const half *) vx;
1248
 
1249
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
1250
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
 
1292
 
1293
  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
1294
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
1295
+ const int row_stride_x, const int channel_stride_x) {
1296
 
1297
+ const half * x = (const half *) vx;
1298
 
1299
  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
1300
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
 
1337
  }
1338
 
1339
  static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
1340
+ const float * xi = (const float *) cxi;
1341
  float * dsti = (float *) cdsti;
1342
 
1343
  *dsti = *xi;
1344
  }
1345
 
1346
  static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
1347
+ const float * xi = (const float *) cxi;
1348
  half * dsti = (half *) cdsti;
1349
 
1350
  *dsti = __float2half(*xi);
 
1468
  add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
1469
  }
1470
 
1471
+ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
1472
+ const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
1473
+ add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
1474
+ }
1475
+
1476
  static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
1477
  const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
1478
  mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
 
1516
 
1517
  static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1518
  const int nb = k / QK_K;
1519
+ #if QK_K == 256
1520
  dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
1521
+ #else
1522
+ dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
1523
+ #endif
1524
  }
1525
 
1526
  static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1527
  const int nb = k / QK_K;
1528
+ #if QK_K == 256
1529
  dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
1530
+ #else
1531
+ dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
1532
+ #endif
1533
  }
1534
 
1535
  static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
 
1539
 
1540
  static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1541
  const int nb = k / QK_K;
1542
+ #if QK_K == 256
1543
  dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
1544
+ #else
1545
+ dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
1546
+ #endif
1547
  }
1548
 
1549
  static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
1550
  const int nb = k / QK_K;
1551
+ #if QK_K == 256
1552
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
1553
+ #else
1554
+ dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
1555
+ #endif
1556
  }
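The dequantize_row_*_K_cuda wrappers above launch one block per super-block and switch the block size with QK_K: 64 threads for 256-value blocks, 32 for the GGML_QKK_64 variant, matching the per-thread mappings in the kernels. A host-side sketch of that launch-shape choice:

```cpp
// Launch-shape helper mirroring the #if QK_K == 256 branches in the wrappers.
#include <cstdio>

struct LaunchShape { int blocks; int threads; };

LaunchShape k_quant_dequant_launch(int k, int qk_k) {
    LaunchShape s;
    s.blocks  = k / qk_k;                 // nb super-blocks in the row
    s.threads = (qk_k == 256) ? 64 : 32;  // matches the kernel's thread mapping
    return s;
}

int main() {
    const LaunchShape a = k_quant_dequant_launch(4096, 256);
    const LaunchShape b = k_quant_dequant_launch(4096, 64);
    std::printf("QK_K=256: <<<%d, %d>>>, QK_K=64: <<<%d, %d>>>\n",
                a.blocks, a.threads, b.blocks, b.threads);
}
```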
1557
 
1558
  static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 
1698
  const dim3 block_nums(1, nrows_x, nchannels_x);
1699
  const dim3 block_dims(WARP_SIZE, 1, 1);
1700
  mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
1701
+ (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
1702
  }
1703
 
1704
  static void ggml_cpy_f32_f32_cuda(
 
1777
  int id;
1778
  CUDA_CHECK(cudaGetDevice(&id));
1779
 
1780
+ int best_i = -1;
1781
+ size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
1782
+ int worst_i = -1;
1783
+ size_t worst_size = 0; //largest unused buffer seen so far
1784
+
1785
  for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
1786
  cuda_buffer& b = g_cuda_buffer_pool[id][i];
1787
+ if (b.size > 0 && b.size >= size && b.size < best_size)
1788
+ {
1789
+ best_i = i;
1790
+ best_size = b.size;
 
 
1791
  }
1792
+ if (b.size > 0 && b.size > worst_size)
1793
+ {
1794
+ worst_i = i;
1795
+ worst_size = b.size;
1796
+ }
1797
+ }
1798
+ if(best_i!=-1) //found the smallest buffer that fits our needs
1799
+ {
1800
+ cuda_buffer& b = g_cuda_buffer_pool[id][best_i];
1801
+ void * ptr = b.ptr;
1802
+ *actual_size = b.size;
1803
+ b.ptr = nullptr;
1804
+ b.size = 0;
1805
+ return ptr;
1806
+ }
1807
+ if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
1808
+ {
1809
+ cuda_buffer& b = g_cuda_buffer_pool[id][worst_i];
1810
+ b.size = 0;
1811
+ void * ptr = b.ptr;
1812
+ cudaFree(ptr);
1813
+ b.ptr = ptr = nullptr;
1814
  }
1815
  void * ptr;
1816
  CUDA_CHECK(cudaMalloc((void **) &ptr, size));
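The pool changes above appear to switch ggml_cuda_pool_malloc to a best-fit policy: reuse the smallest cached buffer that still fits, otherwise free the largest cached buffer before falling back to a fresh allocation. A CPU-only model of that policy, with malloc/free standing in for cudaMalloc/cudaFree and an arbitrary pool size:

```cpp
// CPU model of the revised buffer-pool policy (assumed behaviour from the diff).
#include <cstdlib>
#include <cstddef>
#include <limits>

struct PoolBuffer { void * ptr = nullptr; std::size_t size = 0; };

constexpr int MAX_BUFFERS = 256;          // MAX_CUDA_BUFFERS in the real code
static PoolBuffer g_pool[MAX_BUFFERS];

void * pool_malloc(std::size_t size, std::size_t * actual_size) {
    int best_i = -1, worst_i = -1;
    std::size_t best_size  = std::numeric_limits<std::size_t>::max();
    std::size_t worst_size = 0;

    for (int i = 0; i < MAX_BUFFERS; ++i) {
        PoolBuffer & b = g_pool[i];
        if (b.size > 0 && b.size >= size && b.size < best_size) { best_i = i;  best_size  = b.size; }
        if (b.size > 0 && b.size > worst_size)                  { worst_i = i; worst_size = b.size; }
    }
    if (best_i != -1) {                   // smallest cached buffer that fits: reuse it
        PoolBuffer & b = g_pool[best_i];
        void * ptr = b.ptr;
        *actual_size = b.size;
        b.ptr = nullptr; b.size = 0;
        return ptr;
    }
    if (worst_i != -1) {                  // nothing fits: evict the largest cached buffer
        PoolBuffer & b = g_pool[worst_i];
        std::free(b.ptr);                 // cudaFree in the real code
        b.ptr = nullptr; b.size = 0;
    }
    *actual_size = size;
    return std::malloc(size);             // cudaMalloc in the real code
}
```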
 
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
  cudaStream_t & cudaStream_main){

+ GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
  GGML_ASSERT(src1_ddf_i != nullptr);
  GGML_ASSERT(dst_ddf_i != nullptr);

  const int64_t i01_diff = i01_high - i01_low;

  // compute
+ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+ } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+ add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
+ } else {
+ GGML_ASSERT(false);
+ }
  CUDA_CHECK(cudaGetLastError());

  (void) src1;
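ggml_cuda_op_add now dispatches on the source and destination types: the f32 path is unchanged, f16 src0/dst routes to the new mixed-precision kernel, and anything else asserts. A tiny stand-alone sketch of that dispatch, with the types modeled as an enum:

```cpp
// Dispatch sketch for the type check added to ggml_cuda_op_add.
#include <stdexcept>

enum class Type { F32, F16 };
enum class AddKernel { AddF32, AddF16F32F16 };

AddKernel select_add_kernel(Type src0, Type dst) {
    if (src0 == Type::F32 && dst == Type::F32) return AddKernel::AddF32;
    if (src0 == Type::F16 && dst == Type::F16) return AddKernel::AddF16F32F16;
    throw std::logic_error("unsupported type combination"); // GGML_ASSERT(false) in the diff
}
```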
 
  const int n_past = ((int32_t *) src1->data)[0];
  const int n_dims = ((int32_t *) src1->data)[1];
  const int mode = ((int32_t *) src1->data)[2];
+ const int n_ctx = ((int32_t *) src1->data)[3];
  GGML_ASSERT(mode == 0);

  const float theta_scale = powf(10000.0, -2.0f/n_dims);
+ const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);
+
+ const float p = n_ctx <= GGML_TRAINING_CTX ? p0 : p0 * GGML_TRAINING_CTX / n_ctx;

  // compute
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
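The added lines compute the RoPE position from n_past and i02, then compress it linearly once the runtime context n_ctx exceeds GGML_TRAINING_CTX, which keeps rotation angles inside the range seen during training. A small sketch of that scaling; the 2048 training-context value here is only an assumption for the example.

```cpp
// Position scaling applied before rope_f32_cuda is launched.
#include <cstdio>

constexpr float kTrainingCtx = 2048.0f;   // stand-in for GGML_TRAINING_CTX

float scaled_rope_position(int n_past, int i02, int mode, int n_ctx) {
    const float p0 = ((mode & 1) == 0) ? float(n_past + i02) : float(i02);
    // compress positions linearly once the runtime context exceeds the training context
    return (n_ctx <= kTrainingCtx) ? p0 : p0 * kTrainingCtx / n_ctx;
}

int main() {
    std::printf("p(n_ctx=2048) = %.1f\n", scaled_rope_position(100, 0, 0, 2048)); // 100.0
    std::printf("p(n_ctx=4096) = %.1f\n", scaled_rope_position(100, 0, 0, 4096)); // 50.0
}
```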
 
2595
  }
2596
 
2597
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2598
+ // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
2599
+ // Due to flatten_rows == true this does in practice not make a difference however.
2600
+ // Better solution would be nice but right now that would require disproportionate changes.
2601
+ GGML_ASSERT(
2602
+ (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
2603
+ src1->type == GGML_TYPE_F32 &&
2604
+ (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
2605
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
2606
  }
2607
 
2608
  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
2855
  delete extra;
2856
  }
2857
 
2858
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
2859
  if (scratch && g_scratch_size == 0) {
2860
  return;
2861
  }
 
2864
  if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
2865
  const ggml_op src0_op = tensor->src0->op;
2866
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
2867
+ ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
2868
  }
2869
  }
2870
  if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
2871
+ ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
2872
  }
2873
 
2874
  tensor->backend = GGML_BACKEND_GPU;
2875
  struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
2876
+ memset(extra, 0, sizeof(*extra));
2877
 
2878
  const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
2879
+ tensor->op == GGML_OP_VIEW ||
2880
+ force_inplace;
2881
  const size_t size = ggml_nbytes(tensor);
2882
 
2883
  CUDA_CHECK(cudaSetDevice(g_main_device));
2884
+ if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
2885
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
2886
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
2887
  size_t offset = 0;
 
2920
  }
2921
 
2922
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
2923
+ ggml_cuda_assign_buffers_impl(tensor, true, false);
2924
  }
2925
 
2926
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
2927
+ ggml_cuda_assign_buffers_impl(tensor, false, false);
2928
+ }
2929
+
2930
+ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
2931
+ ggml_cuda_assign_buffers_impl(tensor, false, true);
2932
  }
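With the extra force_inplace flag above, a tensor aliases its source's device buffer when it already shares data with src0, when it is a view, or when the caller forces it through the new ggml_cuda_assign_buffers_force_inplace entry point (declared in ggml-cuda.h below). A minimal model of that decision, with illustrative names:

```cpp
// Toy model of the in-place decision in ggml_cuda_assign_buffers_impl.
struct toy_tensor {
    const void *       data    = nullptr;
    const toy_tensor * src0    = nullptr;
    bool               is_view = false;   // stands in for tensor->op == GGML_OP_VIEW
};

bool should_alias_src0(const toy_tensor & t, bool force_inplace) {
    const bool shares_data = t.src0 != nullptr && t.src0->data == t.data;
    return shares_data || t.is_view || force_inplace;
}

int main() {
    toy_tensor src;
    toy_tensor view_of_src;
    view_of_src.src0    = &src;
    view_of_src.is_view = true;
    return should_alias_src0(view_of_src, /*force_inplace=*/false) ? 0 : 1;
}
```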
2933
 
2934
  void ggml_cuda_set_main_device(int main_device) {
ggml-cuda.h CHANGED
@@ -29,6 +29,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
  void ggml_cuda_free_data(struct ggml_tensor * tensor);
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
  void ggml_cuda_set_main_device(int main_device);
  void ggml_cuda_set_scratch_size(size_t scratch_size);
  void ggml_cuda_free_scratch(void);
ggml-metal.m CHANGED
@@ -51,21 +51,21 @@ struct ggml_metal_context {
51
  GGML_METAL_DECL_KERNEL(get_rows_f16);
52
  GGML_METAL_DECL_KERNEL(get_rows_q4_0);
53
  GGML_METAL_DECL_KERNEL(get_rows_q4_1);
54
- GGML_METAL_DECL_KERNEL(get_rows_q2_k);
55
- GGML_METAL_DECL_KERNEL(get_rows_q3_k);
56
- GGML_METAL_DECL_KERNEL(get_rows_q4_k);
57
- GGML_METAL_DECL_KERNEL(get_rows_q5_k);
58
- GGML_METAL_DECL_KERNEL(get_rows_q6_k);
59
  GGML_METAL_DECL_KERNEL(rms_norm);
60
  GGML_METAL_DECL_KERNEL(norm);
61
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
62
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
63
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
64
- GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
65
- GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
66
- GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
67
- GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
68
- GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
69
  GGML_METAL_DECL_KERNEL(rope);
70
  GGML_METAL_DECL_KERNEL(alibi_f32);
71
  GGML_METAL_DECL_KERNEL(cpy_f32_f16);
@@ -132,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
132
  exit(1);
133
  }
134
 
 
 
 
 
 
135
  ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
 
136
  if (error) {
137
  fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
138
  exit(1);
@@ -159,21 +165,21 @@ struct ggml_metal_context * ggml_metal_init(void) {
159
  GGML_METAL_ADD_KERNEL(get_rows_f16);
160
  GGML_METAL_ADD_KERNEL(get_rows_q4_0);
161
  GGML_METAL_ADD_KERNEL(get_rows_q4_1);
162
- GGML_METAL_ADD_KERNEL(get_rows_q2_k);
163
- GGML_METAL_ADD_KERNEL(get_rows_q3_k);
164
- GGML_METAL_ADD_KERNEL(get_rows_q4_k);
165
- GGML_METAL_ADD_KERNEL(get_rows_q5_k);
166
- GGML_METAL_ADD_KERNEL(get_rows_q6_k);
167
  GGML_METAL_ADD_KERNEL(rms_norm);
168
  GGML_METAL_ADD_KERNEL(norm);
169
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
170
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
171
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
172
- GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
173
- GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
174
- GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
175
- GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
176
- GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
177
  GGML_METAL_ADD_KERNEL(rope);
178
  GGML_METAL_ADD_KERNEL(alibi_f32);
179
  GGML_METAL_ADD_KERNEL(cpy_f32_f16);
@@ -662,7 +668,7 @@ void ggml_metal_graph_compute(
662
 
663
  nth0 = 4;
664
  nth1 = 16;
665
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
666
  } break;
667
  case GGML_TYPE_Q3_K:
668
  {
@@ -671,7 +677,7 @@ void ggml_metal_graph_compute(
671
 
672
  nth0 = 4;
673
  nth1 = 16;
674
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
675
  } break;
676
  case GGML_TYPE_Q4_K:
677
  {
@@ -680,7 +686,7 @@ void ggml_metal_graph_compute(
680
 
681
  nth0 = 4;
682
  nth1 = 16;
683
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
684
  } break;
685
  case GGML_TYPE_Q5_K:
686
  {
@@ -689,7 +695,7 @@ void ggml_metal_graph_compute(
689
 
690
  nth0 = 4;
691
  nth1 = 16;
692
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
693
  } break;
694
  case GGML_TYPE_Q6_K:
695
  {
@@ -698,7 +704,7 @@ void ggml_metal_graph_compute(
698
 
699
  nth0 = 4;
700
  nth1 = 16;
701
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
702
  } break;
703
  default:
704
  {
@@ -750,11 +756,11 @@ void ggml_metal_graph_compute(
750
  case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
751
  case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
752
  case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
753
- case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
754
- case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
755
- case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
756
- case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
757
- case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
758
  default: GGML_ASSERT(false && "not implemented");
759
  }
760
 
 
51
  GGML_METAL_DECL_KERNEL(get_rows_f16);
52
  GGML_METAL_DECL_KERNEL(get_rows_q4_0);
53
  GGML_METAL_DECL_KERNEL(get_rows_q4_1);
54
+ GGML_METAL_DECL_KERNEL(get_rows_q2_K);
55
+ GGML_METAL_DECL_KERNEL(get_rows_q3_K);
56
+ GGML_METAL_DECL_KERNEL(get_rows_q4_K);
57
+ GGML_METAL_DECL_KERNEL(get_rows_q5_K);
58
+ GGML_METAL_DECL_KERNEL(get_rows_q6_K);
59
  GGML_METAL_DECL_KERNEL(rms_norm);
60
  GGML_METAL_DECL_KERNEL(norm);
61
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
62
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
63
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
64
+ GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
65
+ GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
66
+ GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
67
+ GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
68
+ GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
69
  GGML_METAL_DECL_KERNEL(rope);
70
  GGML_METAL_DECL_KERNEL(alibi_f32);
71
  GGML_METAL_DECL_KERNEL(cpy_f32_f16);
 
132
  exit(1);
133
  }
134
 
135
+ #ifdef GGML_QKK_64
136
+ MTLCompileOptions* options = [MTLCompileOptions new];
137
+ options.preprocessorMacros = @{ @"QK_K" : @(64) };
138
+ ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
139
+ #else
140
  ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
141
+ #endif
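When GGML_QKK_64 is set, the Metal path above passes QK_K=64 to the shader compiler through MTLCompileOptions.preprocessorMacros; the .metal source itself falls back to 256 when the macro is absent. A generic, non-Metal C++ illustration of the same idea of injecting a define into shader source before compilation:

```cpp
// Generic illustration only; the actual code uses MTLCompileOptions.
#include <string>

std::string with_qk_k(const std::string & shader_src, bool qkk_64) {
    // the .metal source defines QK_K as 256 when the macro is not predefined
    return qkk_64 ? "#define QK_K 64\n" + shader_src : shader_src;
}
```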
142
  if (error) {
143
  fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
144
  exit(1);
 
165
  GGML_METAL_ADD_KERNEL(get_rows_f16);
166
  GGML_METAL_ADD_KERNEL(get_rows_q4_0);
167
  GGML_METAL_ADD_KERNEL(get_rows_q4_1);
168
+ GGML_METAL_ADD_KERNEL(get_rows_q2_K);
169
+ GGML_METAL_ADD_KERNEL(get_rows_q3_K);
170
+ GGML_METAL_ADD_KERNEL(get_rows_q4_K);
171
+ GGML_METAL_ADD_KERNEL(get_rows_q5_K);
172
+ GGML_METAL_ADD_KERNEL(get_rows_q6_K);
173
  GGML_METAL_ADD_KERNEL(rms_norm);
174
  GGML_METAL_ADD_KERNEL(norm);
175
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
176
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
177
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
178
+ GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
179
+ GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
180
+ GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
181
+ GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
182
+ GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
183
  GGML_METAL_ADD_KERNEL(rope);
184
  GGML_METAL_ADD_KERNEL(alibi_f32);
185
  GGML_METAL_ADD_KERNEL(cpy_f32_f16);
 
668
 
669
  nth0 = 4;
670
  nth1 = 16;
671
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
672
  } break;
673
  case GGML_TYPE_Q3_K:
674
  {
 
677
 
678
  nth0 = 4;
679
  nth1 = 16;
680
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
681
  } break;
682
  case GGML_TYPE_Q4_K:
683
  {
 
686
 
687
  nth0 = 4;
688
  nth1 = 16;
689
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
690
  } break;
691
  case GGML_TYPE_Q5_K:
692
  {
 
695
 
696
  nth0 = 4;
697
  nth1 = 16;
698
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
699
  } break;
700
  case GGML_TYPE_Q6_K:
701
  {
 
704
 
705
  nth0 = 4;
706
  nth1 = 16;
707
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
708
  } break;
709
  default:
710
  {
 
756
  case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
757
  case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
758
  case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
759
+ case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
760
+ case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
761
+ case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
762
+ case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
763
+ case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
764
  default: GGML_ASSERT(false && "not implemented");
765
  }
766
 
ggml-metal.metal CHANGED
@@ -428,7 +428,7 @@ kernel void kernel_mul_mat_q4_0_f32(
428
  }
429
  threadgroup_barrier(mem_flags::mem_threadgroup);
430
  if (ith == 0) {
431
- for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
432
  dst[r1*ne0 + r0] = sum[0];
433
  }
434
  }
@@ -497,7 +497,7 @@ kernel void kernel_mul_mat_q4_1_f32(
497
  }
498
  threadgroup_barrier(mem_flags::mem_threadgroup);
499
  if (ith == 0) {
500
- for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
501
  dst[r1*ne0 + r0] = sum[0];
502
  }
503
  }
@@ -775,47 +775,76 @@ kernel void kernel_cpy_f32_f32(
775
 
776
  //============================================ k-quants ======================================================
777
 
 
778
  #define QK_K 256
 
 
 
 
 
 
 
 
 
779
 
780
  typedef struct {
781
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
782
  uint8_t qs[QK_K/4]; // quants
783
  half d; // super-block scale for quantized scales
784
  half dmin; // super-block scale for quantized mins
785
- } block_q2_k;
786
  // 84 bytes / block
787
 
788
  typedef struct {
789
  uint8_t hmask[QK_K/8]; // quants - high bit
790
  uint8_t qs[QK_K/4]; // quants - low 2 bits
791
- uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
792
- half d; // super-block scale
793
- } block_q3_k;
794
- // 110 bytes / block
795
-
 
 
 
 
 
 
 
 
 
 
796
  typedef struct {
797
  half d; // super-block scale for quantized scales
798
  half dmin; // super-block scale for quantized mins
799
- uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
800
  uint8_t qs[QK_K/2]; // 4--bit quants
801
- } block_q4_k;
802
- // 144 bytes / block
803
 
 
 
 
 
 
 
 
 
804
  typedef struct {
805
  half d; // super-block scale for quantized scales
806
  half dmin; // super-block scale for quantized mins
807
  uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
808
  uint8_t qh[QK_K/8]; // quants, high bit
809
  uint8_t qs[QK_K/2]; // quants, low 4 bits
810
- } block_q5_k;
811
  // 176 bytes / block
 
812
 
813
  typedef struct {
814
  uint8_t ql[QK_K/2]; // quants, lower 4 bits
815
  uint8_t qh[QK_K/4]; // quants, upper 2 bits
816
  int8_t scales[QK_K/16]; // scales, quantized with 8 bits
817
  half d; // super-block scale
818
- } block_q6_k;
819
  // 210 bytes / block
820
 
821
  static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
@@ -836,7 +865,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
836
 
837
  //========================================== dequantization =============================
838
 
839
- static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, int k) {
840
  assert(k % QK_K == 0);
841
  const int nb = k / QK_K;
842
 
@@ -847,6 +876,7 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i
847
 
848
  device const uint8_t * q = x[i].qs;
849
 
 
850
  int is = 0;
851
  float dl, ml;
852
  for (int n = 0; n < QK_K; n += 128) {
@@ -865,14 +895,29 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i
865
  }
866
  q += 32;
867
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
868
 
869
  }
870
  }
871
 
872
- static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, int k) {
873
  assert(k % QK_K == 0);
874
  const int nb = k / QK_K;
875
 
 
 
876
  const uint16_t kmask1 = 0x0303;
877
  const uint16_t kmask2 = 0x0f0f;
878
 
@@ -918,22 +963,49 @@ static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, i
918
  }
919
  q += 32;
920
  }
 
 
 
921
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
922
  }
 
923
 
924
  }
925
 
926
- static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, int k) {
927
  assert(k % QK_K == 0);
928
  const int nb = k / QK_K;
929
 
930
-
931
  for (int i = 0; i < nb; i++) {
932
 
 
 
 
933
  const float d = x[i].d;
934
  const float min = x[i].dmin;
935
 
936
- device const uint8_t * q = x[i].qs;
937
  device const uint8_t * scales = x[i].scales;
938
 
939
  int is = 0;
@@ -945,14 +1017,29 @@ static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, i
945
  for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2;
946
  q += 32; is += 2;
947
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
948
 
949
  }
950
  }
951
 
952
- static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, int k) {
953
  assert(k % QK_K == 0);
954
  const int nb = k / QK_K;
955
 
 
956
  for (int i = 0; i < nb; i++) {
957
 
958
  const float d = (float)(x[i].d);
@@ -973,10 +1060,32 @@ static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, i
973
  u1 <<= 2; u2 <<= 2;
974
  }
975
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
976
 
977
  }
978
 
979
- static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, int k) {
980
  assert(k % QK_K == 0);
981
  const int nb = k / QK_K;
982
 
@@ -988,6 +1097,7 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i
988
 
989
  const float d = x[i].d;
990
 
 
991
  for (int n = 0; n < QK_K; n += 128) {
992
  for (int l = 0; l < 32; ++l) {
993
  int is = l/16;
@@ -1005,10 +1115,23 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i
1005
  qh += 32;
1006
  sc += 8;
1007
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
1008
  }
1009
  }
1010
 
1011
- kernel void kernel_get_rows_q2_k(
1012
  device const void * src0,
1013
  device const int * src1,
1014
  device float * dst,
@@ -1019,12 +1142,12 @@ kernel void kernel_get_rows_q2_k(
1019
  const int i = tpig;
1020
  const int r = ((device int32_t *) src1)[i];
1021
 
1022
- dequantize_row_q2_k(
1023
- (device const block_q2_k *) ((device char *) src0 + r*nb01),
1024
  (device float *) ((device char *) dst + i*nb1), ne00);
1025
  }
1026
 
1027
- kernel void kernel_get_rows_q3_k(
1028
  device const void * src0,
1029
  device const int * src1,
1030
  device float * dst,
@@ -1035,12 +1158,12 @@ kernel void kernel_get_rows_q3_k(
1035
  const int i = tpig;
1036
  const int r = ((device int32_t *) src1)[i];
1037
 
1038
- dequantize_row_q3_k(
1039
- (device const block_q3_k *) ((device char *) src0 + r*nb01),
1040
  (device float *) ((device char *) dst + i*nb1), ne00);
1041
  }
1042
 
1043
- kernel void kernel_get_rows_q4_k(
1044
  device const void * src0,
1045
  device const int * src1,
1046
  device float * dst,
@@ -1051,12 +1174,12 @@ kernel void kernel_get_rows_q4_k(
1051
  const int i = tpig;
1052
  const int r = ((device int32_t *) src1)[i];
1053
 
1054
- dequantize_row_q4_k(
1055
- (device const block_q4_k *) ((device char *) src0 + r*nb01),
1056
  (device float *) ((device char *) dst + i*nb1), ne00);
1057
  }
1058
 
1059
- kernel void kernel_get_rows_q5_k(
1060
  device const void * src0,
1061
  device const int * src1,
1062
  device float * dst,
@@ -1067,12 +1190,12 @@ kernel void kernel_get_rows_q5_k(
1067
  const int i = tpig;
1068
  const int r = ((device int32_t *) src1)[i];
1069
 
1070
- dequantize_row_q5_k(
1071
- (device const block_q5_k *) ((device char *) src0 + r*nb01),
1072
  (device float *) ((device char *) dst + i*nb1), ne00);
1073
  }
1074
 
1075
- kernel void kernel_get_rows_q6_k(
1076
  device const void * src0,
1077
  device const int * src1,
1078
  device float * dst,
@@ -1083,14 +1206,14 @@ kernel void kernel_get_rows_q6_k(
1083
  const int i = tpig;
1084
  const int r = ((device int32_t *) src1)[i];
1085
 
1086
- dequantize_row_q6_k(
1087
- (device const block_q6_k *) ((device char *) src0 + r*nb01),
1088
  (device float *) ((device char *) dst + i*nb1), ne00);
1089
  }
1090
 
1091
  //====================================== dot products =========================
1092
 
1093
- kernel void kernel_mul_mat_q2_k_f32(
1094
  device const void * src0,
1095
  device const float * src1,
1096
  device float * dst,
@@ -1107,12 +1230,15 @@ kernel void kernel_mul_mat_q2_k_f32(
1107
  const int64_t r0 = tgpig.x;
1108
  const int64_t r1 = tgpig.y;
1109
 
1110
- device const block_q2_k * x = (device const block_q2_k *) src0 + r0*nb;
1111
  device const float * yy = (device const float *) src1 + r1*ne10;
1112
 
1113
  const int nth = tptg.x*tptg.y;
1114
  const int ith = tptg.y*tpitg.x + tpitg.y;
1115
 
 
 
 
1116
  const int tid = tpitg.y; // 0...16
1117
  const int il = tid/4; // 0...3
1118
  const int ir = tid%4; // 0...3
@@ -1125,9 +1251,6 @@ kernel void kernel_mul_mat_q2_k_f32(
1125
  const int y_offset = 64*il + n*ir;
1126
  const int q_offset = 32*ip + n*ir;
1127
 
1128
- sum[ith] = 0.0f;
1129
-
1130
- float sumf = 0;
1131
  for (int i = tpitg.x; i < nb; i += tptg.x) {
1132
 
1133
  device const uint8_t * q = x[i].qs + q_offset;
@@ -1140,7 +1263,6 @@ kernel void kernel_mul_mat_q2_k_f32(
1140
 
1141
  device const float * y = yy + i*QK_K + y_offset;
1142
 
1143
- //float4 s = {0.f, 0.f, 0.f, 0.f};
1144
  float2 s = {0.f, 0.f};
1145
  float smin = 0;
1146
  for (int l = 0; l < n; ++l) {
@@ -1155,25 +1277,38 @@ kernel void kernel_mul_mat_q2_k_f32(
1155
  sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin;
1156
 
1157
  }
1158
- sum[ith] = sumf;
 
1159
 
1160
- //int mask1 = (ith%4 == 0);
1161
- //int mask2 = (ith%16 == 0);
 
1162
 
1163
- //threadgroup_barrier(mem_flags::mem_threadgroup);
1164
- //for (int i = 1; i < 4; ++i) sum[ith] += mask1 * sum[ith + i];
1165
- //threadgroup_barrier(mem_flags::mem_threadgroup);
1166
- //for (int i = 4; i < 16; i += 4) sum[ith] += mask2 * sum[ith + i];
1167
- //threadgroup_barrier(mem_flags::mem_threadgroup);
1168
- //if (ith == 0) {
1169
- // for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
1170
- // dst[r1*ne0 + r0] = sum[0];
1171
- //}
 
 
 
 
 
 
 
 
 
 
 
 
 
1172
 
1173
  //
1174
  // Accumulate the sum from all threads in the threadgroup
1175
- // This version is slightly faster than the commented out one below,
1176
- // which I copy-pasted from ggerganov's q4_0 dot product for metal.
1177
  //
1178
  threadgroup_barrier(mem_flags::mem_threadgroup);
1179
  if (ith%4 == 0) {
@@ -1190,7 +1325,7 @@ kernel void kernel_mul_mat_q2_k_f32(
1190
  }
1191
  }
1192
 
1193
- kernel void kernel_mul_mat_q3_k_f32(
1194
  device const void * src0,
1195
  device const float * src1,
1196
  device float * dst,
@@ -1203,23 +1338,25 @@ kernel void kernel_mul_mat_q3_k_f32(
1203
  uint2 tpitg[[thread_position_in_threadgroup]],
1204
  uint2 tptg[[threads_per_threadgroup]]) {
1205
 
1206
- const uint16_t kmask1 = 0x0303;
1207
- const uint16_t kmask2 = 0x0f0f;
1208
-
1209
- const uint8_t m3 = 3;
1210
- const int8_t m4 = 4;
1211
-
1212
  const int nb = ne00/QK_K;
1213
 
1214
  const int64_t r0 = tgpig.x;
1215
  const int64_t r1 = tgpig.y;
1216
 
1217
- device const block_q3_k * x = (device const block_q3_k *) src0 + r0*nb;
1218
  device const float * yy = (device const float *) src1 + r1*ne10;
1219
 
1220
  const int nth = tptg.x*tptg.y;
1221
  const int ith = tptg.y*tpitg.x + tpitg.y;
1222
 
 
 
 
 
 
 
 
 
1223
  const int tid = tpitg.y; // expecting 16
1224
  const int ip = tid/8; // 0 or 1
1225
  const int il = tid/2 - 4*ip; // 0...3
@@ -1273,6 +1410,39 @@ kernel void kernel_mul_mat_q3_k_f32(
1273
 
1274
  //sum[ith] = sumf;
1275
  sum[ith] = sumf1 - 32.f*sumf2;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1276
 
1277
  //
1278
  // Accumulate the sum from all threads in the threadgroup
@@ -1293,7 +1463,7 @@ kernel void kernel_mul_mat_q3_k_f32(
1293
 
1294
  }
1295
 
1296
- kernel void kernel_mul_mat_q4_k_f32(
1297
  device const void * src0,
1298
  device const float * src1,
1299
  device float * dst,
@@ -1305,21 +1475,25 @@ kernel void kernel_mul_mat_q4_k_f32(
1305
  uint2 tpitg[[thread_position_in_threadgroup]],
1306
  uint2 tptg[[threads_per_threadgroup]]) {
1307
 
1308
- const uint16_t kmask1 = 0x3f3f;
1309
- const uint16_t kmask2 = 0x0f0f;
1310
- const uint16_t kmask3 = 0xc0c0;
1311
-
1312
  const int nb = ne00/QK_K;
1313
 
1314
  const int64_t r0 = tgpig.x;
1315
  const int64_t r1 = tgpig.y;
1316
 
1317
- device const block_q4_k * x = (device const block_q4_k *) src0 + r0*nb;
1318
- device const float * yy = (device const float *) src1 + r1*ne10;
1319
-
1320
  const int nth = tptg.x*tptg.y;
1321
  const int ith = tptg.y*tpitg.x + tpitg.y;
1322
 
 
 
 
 
 
 
 
 
 
 
 
1323
  const int tid = tpitg.y; // 0...16
1324
  const int il = tid/4; // 0...3
1325
  const int ir = tid - 4*il;// 0...3
@@ -1332,11 +1506,8 @@ kernel void kernel_mul_mat_q4_k_f32(
1332
  const int q_offset = 32*im + l0;
1333
  const int y_offset = 64*im + l0;
1334
 
1335
- sum[ith] = 0.0f;
1336
-
1337
  uchar2 sc1, sc2, sc3, sc4;
1338
 
1339
- float sumf = 0;
1340
  for (int i = tpitg.x; i < nb; i += tptg.x) {
1341
 
1342
  device const uint8_t * q1 = (x + i)->qs + q_offset;
@@ -1365,6 +1536,30 @@ kernel void kernel_mul_mat_q4_k_f32(
1365
  sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
1366
 
1367
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1368
 
1369
  sum[ith] = sumf;
1370
 
@@ -1401,7 +1596,7 @@ kernel void kernel_mul_mat_q4_k_f32(
1401
  //}
1402
  }
1403
 
1404
- kernel void kernel_mul_mat_q5_k_f32(
1405
  device const void * src0,
1406
  device const float * src1,
1407
  device float * dst,
@@ -1413,21 +1608,25 @@ kernel void kernel_mul_mat_q5_k_f32(
1413
  uint2 tpitg[[thread_position_in_threadgroup]],
1414
  uint2 tptg[[threads_per_threadgroup]]) {
1415
 
1416
- const uint16_t kmask1 = 0x3f3f;
1417
- const uint16_t kmask2 = 0x0f0f;
1418
- const uint16_t kmask3 = 0xc0c0;
1419
-
1420
  const int nb = ne00/QK_K;
1421
 
1422
  const int64_t r0 = tgpig.x;
1423
  const int64_t r1 = tgpig.y;
1424
 
1425
- device const block_q5_k * x = (device const block_q5_k *) src0 + r0*nb;
1426
  device const float * yy = (device const float *) src1 + r1*ne10;
1427
 
1428
  const int nth = tptg.x*tptg.y;
1429
  const int ith = tptg.y*tpitg.x + tpitg.y;
1430
 
 
 
 
 
 
 
 
 
1431
  const int tid = tpitg.y; // 0...16
1432
  const int il = tid/4; // 0...3
1433
  const int ir = tid - 4*il;// 0...3
@@ -1447,7 +1646,6 @@ kernel void kernel_mul_mat_q5_k_f32(
1447
 
1448
  uchar2 sc1, sc2, sc3, sc4;
1449
 
1450
- float sumf = 0;
1451
  for (int i = tpitg.x; i < nb; i += tptg.x) {
1452
 
1453
  device const uint8_t * q1 = (x + i)->qs + q_offset;
@@ -1479,6 +1677,28 @@ kernel void kernel_mul_mat_q5_k_f32(
1479
  sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
1480
 
1481
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1482
  sum[ith] = sumf;
1483
 
1484
  //
@@ -1500,7 +1720,7 @@ kernel void kernel_mul_mat_q5_k_f32(
1500
 
1501
  }
1502
 
1503
- kernel void kernel_mul_mat_q6_k_f32(
1504
  device const void * src0,
1505
  device const float * src1,
1506
  device float * dst,
@@ -1522,12 +1742,15 @@ kernel void kernel_mul_mat_q6_k_f32(
1522
  const int64_t r0 = tgpig.x;
1523
  const int64_t r1 = tgpig.y;
1524
 
1525
- device const block_q6_k * x = (device const block_q6_k *) src0 + r0*nb;
1526
  device const float * yy = (device const float *) src1 + r1*ne10;
1527
 
1528
  const int nth = tptg.x*tptg.y;
1529
  const int ith = tptg.y*tpitg.x + tpitg.y;
1530
 
 
 
 
1531
  // Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
1532
  const int iqs = 16 * tpitg.y;
1533
  const int ip = iqs / 128; // 0 or 1
@@ -1540,7 +1763,6 @@ kernel void kernel_mul_mat_q6_k_f32(
1540
  const int q_offset_l = 64*ip + l0;
1541
  const int q_offset_h = 32*ip + l0;
1542
 
1543
- float sumf = 0;
1544
  for (int i = tpitg.x; i < nb; i += tptg.x) {
1545
 
1546
  device const uint8_t * ql = x[i].ql + q_offset_l;
@@ -1562,6 +1784,28 @@ kernel void kernel_mul_mat_q6_k_f32(
1562
  sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
1563
 
1564
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1565
 
1566
  sum[ith] = sumf;
1567
 
 
428
  }
429
  threadgroup_barrier(mem_flags::mem_threadgroup);
430
  if (ith == 0) {
431
+ for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
432
  dst[r1*ne0 + r0] = sum[0];
433
  }
434
  }
 
497
  }
498
  threadgroup_barrier(mem_flags::mem_threadgroup);
499
  if (ith == 0) {
500
+ for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
501
  dst[r1*ne0 + r0] = sum[0];
502
  }
503
  }
 
775
 
776
  //============================================ k-quants ======================================================
777
 
778
+ #ifndef QK_K
779
  #define QK_K 256
780
+ #else
781
+ static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64");
782
+ #endif
783
+
784
+ #if QK_K == 256
785
+ #define K_SCALE_SIZE 12
786
+ #else
787
+ #define K_SCALE_SIZE 4
788
+ #endif
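K_SCALE_SIZE above follows from the q4_K/q5_K scale packing described in the struct comments: at QK_K = 256 each of the eight 32-value sub-blocks stores a 6-bit scale and a 6-bit min, i.e. 96 bits in total. A quick check of that arithmetic:

```cpp
// Why K_SCALE_SIZE is 12 bytes for QK_K == 256 (the QK_K == 64 variants
// switch to the small 4/8-bit scale encodings shown in the structs below).
#include <cstdio>

int main() {
    const int QK_K       = 256;
    const int sub_blocks  = QK_K / 32;                    // 8 sub-blocks
    const int packed_bits = sub_blocks * (6 + 6);         // 6-bit scale + 6-bit min each
    std::printf("K_SCALE_SIZE = %d bytes\n", packed_bits / 8); // 12, i.e. 3*QK_K/64
}
```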
789
 
790
  typedef struct {
791
  uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
792
  uint8_t qs[QK_K/4]; // quants
793
  half d; // super-block scale for quantized scales
794
  half dmin; // super-block scale for quantized mins
795
+ } block_q2_K;
796
  // 84 bytes / block
797
 
798
  typedef struct {
799
  uint8_t hmask[QK_K/8]; // quants - high bit
800
  uint8_t qs[QK_K/4]; // quants - low 2 bits
801
+ #if QK_K == 64
802
+ uint8_t scales[2];
803
+ #else
804
+ uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
805
+ #endif
806
+ half d; // super-block scale
807
+ } block_q3_K;
808
+
809
+ #if QK_K == 64
810
+ typedef struct {
811
+ half d[2]; // super-block scales/mins
812
+ uint8_t scales[2];
813
+ uint8_t qs[QK_K/2]; // 4-bit quants
814
+ } block_q4_K;
815
+ #else
816
  typedef struct {
817
  half d; // super-block scale for quantized scales
818
  half dmin; // super-block scale for quantized mins
819
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
820
  uint8_t qs[QK_K/2]; // 4--bit quants
821
+ } block_q4_K;
822
+ #endif
823
 
824
+ #if QK_K == 64
825
+ typedef struct {
826
+ half d; // super-block scales/mins
827
+ int8_t scales[QK_K/16]; // 8-bit block scales
828
+ uint8_t qh[QK_K/8]; // quants, high bit
829
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
830
+ } block_q5_K;
831
+ #else
832
  typedef struct {
833
  half d; // super-block scale for quantized scales
834
  half dmin; // super-block scale for quantized mins
835
  uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
836
  uint8_t qh[QK_K/8]; // quants, high bit
837
  uint8_t qs[QK_K/2]; // quants, low 4 bits
838
+ } block_q5_K;
839
  // 176 bytes / block
840
+ #endif
841
 
842
  typedef struct {
843
  uint8_t ql[QK_K/2]; // quants, lower 4 bits
844
  uint8_t qh[QK_K/4]; // quants, upper 2 bits
845
  int8_t scales[QK_K/16]; // scales, quantized with 8 bits
846
  half d; // super-block scale
847
+ } block_q6_K;
848
  // 210 bytes / block
849
 
850
  static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
 
865
 
866
  //========================================== dequantization =============================
867
 
868
+ static void dequantize_row_q2_K(device const block_q2_K * x, device float * y, int k) {
869
  assert(k % QK_K == 0);
870
  const int nb = k / QK_K;
871
 
 
876
 
877
  device const uint8_t * q = x[i].qs;
878
 
879
+ #if QK_K == 256
880
  int is = 0;
881
  float dl, ml;
882
  for (int n = 0; n < QK_K; n += 128) {
 
895
  }
896
  q += 32;
897
  }
898
+ #else
899
+ float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4);
900
+ float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4);
901
+ float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4);
902
+ float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4);
903
+ for (int l = 0; l < 16; ++l) {
904
+ y[l+ 0] = dl1 * ((q[l] >> 0) & 3) - ml1;
905
+ y[l+16] = dl2 * ((q[l] >> 2) & 3) - ml2;
906
+ y[l+32] = dl3 * ((q[l] >> 4) & 3) - ml3;
907
+ y[l+48] = dl4 * ((q[l] >> 6) & 3) - ml4;
908
+ }
909
+ y += QK_K;
910
+ #endif
911
 
912
  }
913
  }
914
 
915
+ static void dequantize_row_q3_K(device const block_q3_K * x, device float * y, int k) {
916
  assert(k % QK_K == 0);
917
  const int nb = k / QK_K;
918
 
919
+ #if QK_K == 256
920
+
921
  const uint16_t kmask1 = 0x0303;
922
  const uint16_t kmask2 = 0x0f0f;
923
 
 
963
  }
964
  q += 32;
965
  }
966
+ }
967
+ #else
968
+ for (int i = 0; i < nb; i++) {
969
 
970
+ const float d_all = (float)(x[i].d);
971
+
972
+ device const uint8_t * q = x[i].qs;
973
+ device const uint8_t * hm = x[i].hmask;
974
+
975
+ const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
976
+ const float d2 = d_all * ((x[i].scales[0] >> 4) - 8);
977
+ const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
978
+ const float d4 = d_all * ((x[i].scales[1] >> 4) - 8);
979
+
980
+ for (int l = 0; l < 8; ++l) {
981
+ uint8_t h = hm[l];
982
+ y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4));
983
+ y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4));
984
+ y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4));
985
+ y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4));
986
+ y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4));
987
+ y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4));
988
+ y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4));
989
+ y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4));
990
+ }
991
+ y += QK_K;
992
  }
993
+ #endif
994
 
995
  }
996
 
997
+ static void dequantize_row_q4_K(device const block_q4_K * x, device float * y, int k) {
998
  assert(k % QK_K == 0);
999
  const int nb = k / QK_K;
1000
 
 
1001
  for (int i = 0; i < nb; i++) {
1002
 
1003
+ device const uint8_t * q = x[i].qs;
1004
+
1005
+ #if QK_K == 256
1006
  const float d = x[i].d;
1007
  const float min = x[i].dmin;
1008
 
 
1009
  device const uint8_t * scales = x[i].scales;
1010
 
1011
  int is = 0;
 
1017
  for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2;
1018
  q += 32; is += 2;
1019
  }
1020
+ #else
1021
+ device const uint8_t * s = x[i].scales;
1022
+ device const half2 * dh = (device const half2 *)x[i].d;
1023
+ const float2 d = (float2)dh[0];
1024
+ const float d1 = d[0] * (s[0] & 0xF);
1025
+ const float d2 = d[0] * (s[1] & 0xF);
1026
+ const float m1 = d[1] * (s[0] >> 4);
1027
+ const float m2 = d[1] * (s[1] >> 4);
1028
+ for (int l = 0; l < 32; ++l) {
1029
+ y[l+ 0] = d1 * (q[l] & 0xF) - m1;
1030
+ y[l+32] = d2 * (q[l] >> 4) - m2;
1031
+ }
1032
+ y += QK_K;
1033
+ #endif
1034
 
1035
  }
1036
  }
1037
 
1038
+ static void dequantize_row_q5_K(device const block_q5_K * x, device float * y, int k) {
1039
  assert(k % QK_K == 0);
1040
  const int nb = k / QK_K;
1041
 
1042
+ #if QK_K == 256
1043
  for (int i = 0; i < nb; i++) {
1044
 
1045
  const float d = (float)(x[i].d);
 
1060
  u1 <<= 2; u2 <<= 2;
1061
  }
1062
  }
1063
+ #else
1064
+ for (int i = 0; i < nb; i++) {
1065
+
1066
+ const float d = (float)x[i].d;
1067
+
1068
+ device const uint8_t * ql = x[i].qs;
1069
+ device const uint8_t * qh = x[i].qh;
1070
+ device const int8_t * sc = x[i].scales;
1071
+
1072
+ for (int l = 0; l < 8; ++l) {
1073
+ y[l+ 0] = d * sc[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16));
1074
+ y[l+ 8] = d * sc[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16));
1075
+ y[l+16] = d * sc[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16));
1076
+ y[l+24] = d * sc[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16));
1077
+ y[l+32] = d * sc[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16));
1078
+ y[l+40] = d * sc[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16));
1079
+ y[l+48] = d * sc[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16));
1080
+ y[l+56] = d * sc[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16));
1081
+ }
1082
+ y += QK_K;
1083
+ }
1084
+ #endif
1085
 
1086
  }
1087
 
1088
+ static void dequantize_row_q6_K(device const block_q6_K * x, device float * y, int k) {
1089
  assert(k % QK_K == 0);
1090
  const int nb = k / QK_K;
1091
 
 
1097
 
1098
  const float d = x[i].d;
1099
 
1100
+ #if QK_K == 256
1101
  for (int n = 0; n < QK_K; n += 128) {
1102
  for (int l = 0; l < 32; ++l) {
1103
  int is = l/16;
 
1115
  qh += 32;
1116
  sc += 8;
1117
  }
1118
+ #else
1119
+ for (int l = 0; l < 16; ++l) {
1120
+ const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
1121
+ const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
1122
+ const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
1123
+ const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
1124
+ y[l+ 0] = d * sc[0] * q1;
1125
+ y[l+16] = d * sc[1] * q2;
1126
+ y[l+32] = d * sc[2] * q3;
1127
+ y[l+48] = d * sc[3] * q4;
1128
+ }
1129
+ y += 64;
1130
+ #endif
1131
  }
1132
  }
1133
 
1134
+ kernel void kernel_get_rows_q2_K(
1135
  device const void * src0,
1136
  device const int * src1,
1137
  device float * dst,
 
1142
  const int i = tpig;
1143
  const int r = ((device int32_t *) src1)[i];
1144
 
1145
+ dequantize_row_q2_K(
1146
+ (device const block_q2_K *) ((device char *) src0 + r*nb01),
1147
  (device float *) ((device char *) dst + i*nb1), ne00);
1148
  }
1149
 
1150
+ kernel void kernel_get_rows_q3_K(
1151
  device const void * src0,
1152
  device const int * src1,
1153
  device float * dst,
 
1158
  const int i = tpig;
1159
  const int r = ((device int32_t *) src1)[i];
1160
 
1161
+ dequantize_row_q3_K(
1162
+ (device const block_q3_K *) ((device char *) src0 + r*nb01),
1163
  (device float *) ((device char *) dst + i*nb1), ne00);
1164
  }
1165
 
1166
+ kernel void kernel_get_rows_q4_K(
1167
  device const void * src0,
1168
  device const int * src1,
1169
  device float * dst,
 
1174
  const int i = tpig;
1175
  const int r = ((device int32_t *) src1)[i];
1176
 
1177
+ dequantize_row_q4_K(
1178
+ (device const block_q4_K *) ((device char *) src0 + r*nb01),
1179
  (device float *) ((device char *) dst + i*nb1), ne00);
1180
  }
1181
 
1182
+ kernel void kernel_get_rows_q5_K(
1183
  device const void * src0,
1184
  device const int * src1,
1185
  device float * dst,
 
1190
  const int i = tpig;
1191
  const int r = ((device int32_t *) src1)[i];
1192
 
1193
+ dequantize_row_q5_K(
1194
+ (device const block_q5_K *) ((device char *) src0 + r*nb01),
1195
  (device float *) ((device char *) dst + i*nb1), ne00);
1196
  }
1197
 
1198
+ kernel void kernel_get_rows_q6_K(
1199
  device const void * src0,
1200
  device const int * src1,
1201
  device float * dst,
 
1206
  const int i = tpig;
1207
  const int r = ((device int32_t *) src1)[i];
1208
 
1209
+ dequantize_row_q6_K(
1210
+ (device const block_q6_K *) ((device char *) src0 + r*nb01),
1211
  (device float *) ((device char *) dst + i*nb1), ne00);
1212
  }
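The kernel_get_rows_* kernels above all follow one pattern: src1 holds row indices, and row i of dst receives the dequantized row src1[i] of src0. A plain-C sketch of that gather for an already-dequantized float matrix (editor's illustration, not part of the commit):

#include <stdint.h>

static void get_rows_f32(const float * src0, const int32_t * src1,
                         float * dst, int n_rows, int row_len) {
    for (int i = 0; i < n_rows; ++i) {
        const int32_t r = src1[i];                 // which source row to fetch
        for (int j = 0; j < row_len; ++j) {
            dst[i*row_len + j] = src0[r*row_len + j];
        }
    }
}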
1213
 
1214
  //====================================== dot products =========================
1215
 
1216
+ kernel void kernel_mul_mat_q2_K_f32(
1217
  device const void * src0,
1218
  device const float * src1,
1219
  device float * dst,
 
1230
  const int64_t r0 = tgpig.x;
1231
  const int64_t r1 = tgpig.y;
1232
 
1233
+ device const block_q2_K * x = (device const block_q2_K *) src0 + r0*nb;
1234
  device const float * yy = (device const float *) src1 + r1*ne10;
1235
 
1236
  const int nth = tptg.x*tptg.y;
1237
  const int ith = tptg.y*tpitg.x + tpitg.y;
1238
 
1239
+ float sumf = 0;
1240
+
1241
+ #if QK_K == 256
1242
  const int tid = tpitg.y; // 0...16
1243
  const int il = tid/4; // 0...3
1244
  const int ir = tid%4; // 0...3
 
1251
  const int y_offset = 64*il + n*ir;
1252
  const int q_offset = 32*ip + n*ir;
1253
 
 
 
 
1254
  for (int i = tpitg.x; i < nb; i += tptg.x) {
1255
 
1256
  device const uint8_t * q = x[i].qs + q_offset;
 
1263
 
1264
  device const float * y = yy + i*QK_K + y_offset;
1265
 
 
1266
  float2 s = {0.f, 0.f};
1267
  float smin = 0;
1268
  for (int l = 0; l < n; ++l) {
 
1277
  sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin;
1278
 
1279
  }
1280
+ #else
1281
+ const int il = 4 * tpitg.x;
1282
 
1283
+ uint32_t aux[2];
1284
+ thread const uint8_t * d = (thread const uint8_t *)aux;
1285
+ thread const uint8_t * m = (thread const uint8_t *)aux + 4;
1286
 
1287
+ for (int i = tpitg.y; i < nb; i += tptg.y) {
1288
+
1289
+ device const uint8_t * q = x[i].qs + il;
1290
+ device const float * y = yy + i*QK_K + il;
1291
+
1292
+ const float dall = (float)x[i].d;
1293
+ const float dmin = (float)x[i].dmin;
1294
+
1295
+ device const uint32_t * a = (device const uint32_t *)x[i].scales;
1296
+ aux[0] = a[0] & 0x0f0f0f0f;
1297
+ aux[1] = (a[0] >> 4) & 0x0f0f0f0f;
1298
+
1299
+ for (int l = 0; l < 4; ++l) {
1300
+ sumf += y[l+ 0] * (dall * d[0] * ((q[l] >> 0) & 3) - dmin * m[0])
1301
+ + y[l+16] * (dall * d[1] * ((q[l] >> 2) & 3) - dmin * m[1])
1302
+ + y[l+32] * (dall * d[2] * ((q[l] >> 4) & 3) - dmin * m[2])
1303
+ + y[l+48] * (dall * d[3] * ((q[l] >> 6) & 3) - dmin * m[3]);
1304
+ }
1305
+ }
1306
+ #endif
1307
+
1308
+ sum[ith] = sumf;
1309
 
1310
  //
1311
  // Accumulate the sum from all threads in the threadgroup
 
 
1312
  //
1313
  threadgroup_barrier(mem_flags::mem_threadgroup);
1314
  if (ith%4 == 0) {
 
1325
  }
1326
  }
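After each thread has written its partial result into sum[], the kernel reduces the array inside the threadgroup; the visible `if (ith%4 == 0)` is the first stage of that reduction. The underlying idea, as a serial C sketch over nth partial sums (editor's illustration, assuming nth is a power of two; the real kernel interleaves threadgroup barriers between stages):

static float reduce_partial_sums(float * sum, int nth) {
    for (int stride = nth / 2; stride > 0; stride /= 2) {
        for (int i = 0; i < stride; ++i) {
            sum[i] += sum[i + stride];   // each stage halves the number of live entries
        }
    }
    return sum[0];   // final value, written to dst by one thread in the real kernel
}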
1327
 
1328
+ kernel void kernel_mul_mat_q3_K_f32(
1329
  device const void * src0,
1330
  device const float * src1,
1331
  device float * dst,
 
1338
  uint2 tpitg[[thread_position_in_threadgroup]],
1339
  uint2 tptg[[threads_per_threadgroup]]) {
1340
 
 
 
 
 
 
 
1341
  const int nb = ne00/QK_K;
1342
 
1343
  const int64_t r0 = tgpig.x;
1344
  const int64_t r1 = tgpig.y;
1345
 
1346
+ device const block_q3_K * x = (device const block_q3_K *) src0 + r0*nb;
1347
  device const float * yy = (device const float *) src1 + r1*ne10;
1348
 
1349
  const int nth = tptg.x*tptg.y;
1350
  const int ith = tptg.y*tpitg.x + tpitg.y;
1351
 
1352
+ #if QK_K == 256
1353
+
1354
+ const uint8_t m3 = 3;
1355
+ const int8_t m4 = 4;
1356
+
1357
+ const uint16_t kmask1 = 0x0303;
1358
+ const uint16_t kmask2 = 0x0f0f;
1359
+
1360
  const int tid = tpitg.y; // expecting 16
1361
  const int ip = tid/8; // 0 or 1
1362
  const int il = tid/2 - 4*ip; // 0...3
 
1410
 
1411
  //sum[ith] = sumf;
1412
  sum[ith] = sumf1 - 32.f*sumf2;
1413
+ #else
1414
+ const int il = 4 * tpitg.x; // 0, 4, 8, 12
1415
+ const int im = il/8; // 0, 0, 1, 1
1416
+ const int in = il%8; // 0, 4, 0, 4
1417
+
1418
+ float sumf = 0;
1419
+
1420
+ for (int i = tpitg.y; i < nb; i += tptg.y) {
1421
+
1422
+ const float d_all = (float)(x[i].d);
1423
+
1424
+ device const uint8_t * q = x[i].qs + il;
1425
+ device const uint8_t * h = x[i].hmask + in;
1426
+ device const float * y = yy + i * QK_K + il;
1427
+
1428
+ const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
1429
+ const float d2 = d_all * ((x[i].scales[0] >> 4) - 8);
1430
+ const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
1431
+ const float d4 = d_all * ((x[i].scales[1] >> 4) - 8);
1432
+
1433
+ for (int l = 0; l < 4; ++l) {
1434
+ const uint8_t hm = h[l] >> im;
1435
+ sumf += y[l+ 0] * d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((hm & 0x01) ? 0 : 4))
1436
+ + y[l+16] * d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((hm & 0x04) ? 0 : 4))
1437
+ + y[l+32] * d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((hm & 0x10) ? 0 : 4))
1438
+ + y[l+48] * d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((hm & 0x40) ? 0 : 4));
1439
+ }
1440
+
1441
+ }
1442
+
1443
+ sum[ith] = sumf;
1444
+
1445
+ #endif
1446
 
1447
  //
1448
  // Accumulate the sum from all threads in the threadgroup
 
1463
 
1464
  }
1465
 
1466
+ kernel void kernel_mul_mat_q4_K_f32(
1467
  device const void * src0,
1468
  device const float * src1,
1469
  device float * dst,
 
1475
  uint2 tpitg[[thread_position_in_threadgroup]],
1476
  uint2 tptg[[threads_per_threadgroup]]) {
1477
 
 
 
 
 
1478
  const int nb = ne00/QK_K;
1479
 
1480
  const int64_t r0 = tgpig.x;
1481
  const int64_t r1 = tgpig.y;
1482
 
 
 
 
1483
  const int nth = tptg.x*tptg.y;
1484
  const int ith = tptg.y*tpitg.x + tpitg.y;
1485
 
1486
+ device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb;
1487
+ device const float * yy = (device const float *) src1 + r1*ne10;
1488
+
1489
+ float sumf = 0;
1490
+
1491
+ #if QK_K == 256
1492
+
1493
+ const uint16_t kmask1 = 0x3f3f;
1494
+ const uint16_t kmask2 = 0x0f0f;
1495
+ const uint16_t kmask3 = 0xc0c0;
1496
+
1497
  const int tid = tpitg.y; // 0...16
1498
  const int il = tid/4; // 0...3
1499
  const int ir = tid - 4*il;// 0...3
 
1506
  const int q_offset = 32*im + l0;
1507
  const int y_offset = 64*im + l0;
1508
 
 
 
1509
  uchar2 sc1, sc2, sc3, sc4;
1510
 
 
1511
  for (int i = tpitg.x; i < nb; i += tptg.x) {
1512
 
1513
  device const uint8_t * q1 = (x + i)->qs + q_offset;
 
1536
  sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
1537
 
1538
  }
1539
+ #else
1540
+ uint16_t aux16[2];
1541
+ thread const uint8_t * scales = (thread const uint8_t *)aux16;
1542
+
1543
+ const int il = 4*tpitg.x;
1544
+
1545
+ for (int i = tpitg.y; i < nb; i += tptg.y) {
1546
+
1547
+ device const uint8_t * q = x[i].qs + il;
1548
+ device const float * y = yy + i * QK_K + il;
1549
+
1550
+ const float d = (float)x[i].d[0];
1551
+ const float m = (float)x[i].d[1];
1552
+
1553
+ device const uint16_t * a = (device const uint16_t *)x[i].scales;
1554
+ aux16[0] = a[0] & 0x0f0f;
1555
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
1556
+
1557
+ for (int l = 0; l < 4; ++l) {
1558
+ sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16])
1559
+ + d * scales[1] * (y[l+32] * (q[l] >> 4) + y[l+48] * (q[l+16] >> 4)) - m * scales[3] * (y[l+32] + y[l+48]);
1560
+ }
1561
+ }
1562
+ #endif
1563
 
1564
  sum[ith] = sumf;
1565
 
 
1596
  //}
1597
  }
1598
 
1599
+ kernel void kernel_mul_mat_q5_K_f32(
1600
  device const void * src0,
1601
  device const float * src1,
1602
  device float * dst,
 
1608
  uint2 tpitg[[thread_position_in_threadgroup]],
1609
  uint2 tptg[[threads_per_threadgroup]]) {
1610
 
 
 
 
 
1611
  const int nb = ne00/QK_K;
1612
 
1613
  const int64_t r0 = tgpig.x;
1614
  const int64_t r1 = tgpig.y;
1615
 
1616
+ device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb;
1617
  device const float * yy = (device const float *) src1 + r1*ne10;
1618
 
1619
  const int nth = tptg.x*tptg.y;
1620
  const int ith = tptg.y*tpitg.x + tpitg.y;
1621
 
1622
+ float sumf = 0;
1623
+
1624
+ #if QK_K == 256
1625
+
1626
+ const uint16_t kmask1 = 0x3f3f;
1627
+ const uint16_t kmask2 = 0x0f0f;
1628
+ const uint16_t kmask3 = 0xc0c0;
1629
+
1630
  const int tid = tpitg.y; // 0...16
1631
  const int il = tid/4; // 0...3
1632
  const int ir = tid - 4*il;// 0...3
 
1646
 
1647
  uchar2 sc1, sc2, sc3, sc4;
1648
 
 
1649
  for (int i = tpitg.x; i < nb; i += tptg.x) {
1650
 
1651
  device const uint8_t * q1 = (x + i)->qs + q_offset;
 
1677
  sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
1678
 
1679
  }
1680
+ #else
1681
+ const int il = 4 * tpitg.x; // 0, 4, 8, 12
1682
+ const int im = il/8; // 0, 0, 1, 1
1683
+ const int in = il%8; // 0, 4, 0, 4
1684
+
1685
+ for (int i = tpitg.y; i < nb; i += tptg.y) {
1686
+
1687
+ const float d = (float)x[i].d;
1688
+ device const uint8_t * q = x[i].qs + il;
1689
+ device const uint8_t * h = x[i].qh + in;
1690
+ device const int8_t * s = x[i].scales;
1691
+ device const float * y = yy + i*QK_K + il;
1692
+
1693
+ for (int l = 0; l < 4; ++l) {
1694
+ const uint8_t hl = h[l] >> im;
1695
+ sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16))
1696
+ + y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16))
1697
+ + y[l+32] * d * s[2] * ((q[l+ 0] >> 4) - (hl & 0x10 ? 0 : 16))
1698
+ + y[l+48] * d * s[3] * ((q[l+16] >> 4) - (hl & 0x40 ? 0 : 16));
1699
+ }
1700
+ }
1701
+ #endif
1702
  sum[ith] = sumf;
1703
 
1704
  //
 
1720
 
1721
  }
1722
 
1723
+ kernel void kernel_mul_mat_q6_K_f32(
1724
  device const void * src0,
1725
  device const float * src1,
1726
  device float * dst,
 
1742
  const int64_t r0 = tgpig.x;
1743
  const int64_t r1 = tgpig.y;
1744
 
1745
+ device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb;
1746
  device const float * yy = (device const float *) src1 + r1*ne10;
1747
 
1748
  const int nth = tptg.x*tptg.y;
1749
  const int ith = tptg.y*tpitg.x + tpitg.y;
1750
 
1751
+ float sumf = 0;
1752
+
1753
+ #if QK_K == 256
1754
  // Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
1755
  const int iqs = 16 * tpitg.y;
1756
  const int ip = iqs / 128; // 0 or 1
 
1763
  const int q_offset_l = 64*ip + l0;
1764
  const int q_offset_h = 32*ip + l0;
1765
 
 
1766
  for (int i = tpitg.x; i < nb; i += tptg.x) {
1767
 
1768
  device const uint8_t * ql = x[i].ql + q_offset_l;
 
1784
  sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
1785
 
1786
  }
1787
+ #else
1788
+ const int il = 4*tpitg.x; // 0, 4, 8, 12
1789
+
1790
+ for (int i = tpitg.y; i < nb; i += tptg.y) {
1791
+ device const float * y = yy + i * QK_K + il;
1792
+ device const uint8_t * ql = x[i].ql + il;
1793
+ device const uint8_t * qh = x[i].qh + il;
1794
+ device const int8_t * s = x[i].scales;
1795
+
1796
+ const float d = x[i].d;
1797
+
1798
+ float4 sums = {0.f, 0.f, 0.f, 0.f};
1799
+ for (int l = 0; l < 4; ++l) {
1800
+ sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
1801
+ sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
1802
+ sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32);
1803
+ sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
1804
+ }
1805
+ sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
1806
+ }
1807
+
1808
+ #endif
1809
 
1810
  sum[ith] = sumf;
1811
 
ggml.c CHANGED
@@ -1,5 +1,5 @@
1
- // Defines CLOCK_MONOTONIC on Linux
2
- #define _GNU_SOURCE
3
 
4
  #include "ggml.h"
5
 
@@ -91,6 +91,11 @@ static int sched_yield (void) {
91
  #include <stdatomic.h>
92
 
93
  typedef void* thread_ret_t;
 
 
 
 
 
94
  #endif
95
 
96
  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -119,6 +124,30 @@ typedef void* thread_ret_t;
119
  #define GGML_SOFT_MAX_UNROLL 4
120
  #define GGML_VEC_DOT_UNROLL 2
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  #ifdef GGML_USE_ACCELERATE
123
  // uncomment to use vDSP for soft max computation
124
  // note: not sure if it is actually faster
@@ -131,6 +160,34 @@ typedef void* thread_ret_t;
131
  #define GGML_MEM_ALIGN 16
132
  #endif
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  #if defined(_MSC_VER) || defined(__MINGW32__)
135
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
136
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
@@ -144,6 +201,17 @@ inline static void* ggml_aligned_malloc(size_t size) {
144
  #endif
145
  if (result != 0) {
146
  // Handle allocation failure
 
 
 
 
 
 
 
 
 
 
 
147
  return NULL;
148
  }
149
  return aligned_memory;
@@ -420,7 +488,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
420
  }
421
  }
422
 
423
-
424
  //
425
  // timing
426
  //
@@ -483,6 +550,7 @@ int64_t ggml_cycles_per_ms(void) {
483
  #define ggml_perf_cycles_per_ms() 0
484
  #endif
485
 
 
486
  //
487
  // cache line
488
  //
@@ -3530,30 +3598,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
3530
  *s = 1.f/(*s);
3531
  }
3532
 
3533
- //
3534
- // logging
3535
- //
3536
-
3537
- #if (GGML_DEBUG >= 1)
3538
- #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
3539
- #else
3540
- #define GGML_PRINT_DEBUG(...)
3541
- #endif
3542
-
3543
- #if (GGML_DEBUG >= 5)
3544
- #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
3545
- #else
3546
- #define GGML_PRINT_DEBUG_5(...)
3547
- #endif
3548
-
3549
- #if (GGML_DEBUG >= 10)
3550
- #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
3551
- #else
3552
- #define GGML_PRINT_DEBUG_10(...)
3553
- #endif
3554
-
3555
- #define GGML_PRINT(...) printf(__VA_ARGS__)
3556
-
3557
  //
3558
  // data types
3559
  //
@@ -3713,11 +3757,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3713
  "MAP_UNARY",
3714
  "MAP_BINARY",
3715
 
 
 
 
 
3716
  "CROSS_ENTROPY_LOSS",
3717
  "CROSS_ENTROPY_LOSS_BACK",
3718
  };
3719
 
3720
- static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
3721
 
3722
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3723
  "none",
@@ -3785,11 +3833,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3785
  "f(x)",
3786
  "f(x,y)",
3787
 
 
 
 
 
3788
  "cross_entropy_loss(x,y)",
3789
  "cross_entropy_loss_back(x,y)",
3790
  };
3791
 
3792
- static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
3793
 
3794
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3795
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3820,12 +3872,31 @@ struct ggml_context_container {
3820
  struct ggml_context context;
3821
  };
3822
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3823
  //
3824
  // ggml state
3825
  //
3826
 
3827
  struct ggml_state {
3828
  struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
 
3829
  };
3830
 
3831
  // global state
@@ -3850,6 +3921,75 @@ inline static void ggml_critical_section_end(void) {
3850
  atomic_fetch_sub(&g_state_barrier, 1);
3851
  }
3852
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3853
  ////////////////////////////////////////////////////////////////////////////////
3854
 
3855
  void ggml_print_object(const struct ggml_object * obj) {
@@ -4106,6 +4246,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4106
 
4107
  g_state = (struct ggml_state) {
4108
  /*.contexts =*/ { { 0 } },
 
 
 
 
4109
  };
4110
 
4111
  for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@@ -6634,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
6634
  int n_past,
6635
  int n_dims,
6636
  int mode,
 
6637
  bool inplace) {
6638
  GGML_ASSERT(n_past >= 0);
6639
  bool is_node = false;
@@ -6646,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(
6646
 
6647
  ggml_scratch_save(ctx);
6648
 
6649
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6650
 
6651
  ((int32_t *) b->data)[0] = n_past;
6652
  ((int32_t *) b->data)[1] = n_dims;
6653
  ((int32_t *) b->data)[2] = mode;
 
6654
 
6655
  ggml_scratch_load(ctx);
6656
 
@@ -6667,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
6667
  struct ggml_tensor * a,
6668
  int n_past,
6669
  int n_dims,
6670
- int mode) {
6671
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
 
6672
  }
6673
 
6674
  struct ggml_tensor * ggml_rope_inplace(
@@ -6676,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
6676
  struct ggml_tensor * a,
6677
  int n_past,
6678
  int n_dims,
6679
- int mode) {
6680
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
 
6681
  }
6682
 
6683
  // ggml_rope_back
@@ -7094,9 +7242,14 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
7094
  is_node = true;
7095
  }
7096
 
 
 
 
 
7097
  struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7098
  *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7099
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
7100
 
7101
  result->op = GGML_OP_MAP_UNARY;
7102
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7136,9 +7289,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
7136
  is_node = true;
7137
  }
7138
 
 
 
 
 
7139
  struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7140
  *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7141
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
7142
 
7143
  result->op = GGML_OP_MAP_BINARY;
7144
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7165,6 +7323,150 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
7165
  return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
7166
  }
7167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7168
  // ggml_cross_entropy_loss
7169
 
7170
  struct ggml_tensor * ggml_cross_entropy_loss(
@@ -12142,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
12142
  const struct ggml_tensor * src1,
12143
  struct ggml_tensor * dst) {
12144
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
12145
- GGML_ASSERT(ggml_nelements(src1) == 3);
12146
 
12147
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12148
  return;
@@ -12151,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
12151
  const int n_past = ((int32_t *) src1->data)[0];
12152
  const int n_dims = ((int32_t *) src1->data)[1];
12153
  const int mode = ((int32_t *) src1->data)[2];
 
12154
 
12155
  assert(n_past >= 0);
12156
 
@@ -12195,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
12195
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
12196
 
12197
  const bool is_neox = mode & 2;
 
12198
 
12199
  for (int64_t i3 = 0; i3 < ne3; i3++) {
12200
  for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12205,7 +12509,35 @@ static void ggml_compute_forward_rope_f32(
12205
 
12206
  float theta = (float)p;
12207
 
12208
- if (!is_neox) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12209
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12210
  const float cos_theta = cosf(theta);
12211
  const float sin_theta = sinf(theta);
@@ -12255,7 +12587,7 @@ static void ggml_compute_forward_rope_f16(
12255
  const struct ggml_tensor * src1,
12256
  struct ggml_tensor * dst) {
12257
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
12258
- GGML_ASSERT(ggml_nelements(src1) == 3);
12259
 
12260
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12261
  return;
@@ -12264,6 +12596,7 @@ static void ggml_compute_forward_rope_f16(
12264
  const int n_past = ((int32_t *) src1->data)[0];
12265
  const int n_dims = ((int32_t *) src1->data)[1];
12266
  const int mode = ((int32_t *) src1->data)[2];
 
12267
 
12268
  assert(n_past >= 0);
12269
 
@@ -12308,6 +12641,7 @@ static void ggml_compute_forward_rope_f16(
12308
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
12309
 
12310
  const bool is_neox = mode & 2;
 
12311
 
12312
  for (int64_t i3 = 0; i3 < ne3; i3++) {
12313
  for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12318,7 +12652,35 @@ static void ggml_compute_forward_rope_f16(
12318
 
12319
  float theta = (float)p;
12320
 
12321
- if (!is_neox) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12322
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12323
  const float cos_theta = cosf(theta);
12324
  const float sin_theta = sinf(theta);
@@ -12404,6 +12766,7 @@ static void ggml_compute_forward_rope_back_f32(
12404
  const int n_past = ((int32_t *) src1->data)[0];
12405
  const int n_dims = ((int32_t *) src1->data)[1];
12406
  const int mode = ((int32_t *) src1->data)[2];
 
12407
 
12408
  assert(n_past >= 0);
12409
 
@@ -12457,6 +12820,9 @@ static void ggml_compute_forward_rope_back_f32(
12457
  float theta = (float)p;
12458
 
12459
  if (!is_neox) {
 
 
 
12460
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12461
  const float cos_theta = cosf(theta);
12462
  const float sin_theta = sinf(theta);
@@ -12517,6 +12883,7 @@ static void ggml_compute_forward_rope_back_f16(
12517
  const int n_past = ((int32_t *) src1->data)[0];
12518
  const int n_dims = ((int32_t *) src1->data)[1];
12519
  const int mode = ((int32_t *) src1->data)[2];
 
12520
 
12521
  assert(n_past >= 0);
12522
 
@@ -12570,6 +12937,9 @@ static void ggml_compute_forward_rope_back_f16(
12570
  float theta = (float)p;
12571
 
12572
  if (!is_neox) {
 
 
 
12573
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12574
  const float cos_theta = cosf(theta);
12575
  const float sin_theta = sinf(theta);
@@ -13210,8 +13580,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13210
  const int nk1 = ne01;
13211
 
13212
  // size of the convolution row - the kernel size unrolled across all channels
13213
- // round-up so it is more suitable for SIMD
13214
- const int ew0 = ggml_up32(nk0*nk1*ne02);
13215
 
13216
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13217
  GGML_ASSERT(nb10 == sizeof(float));
@@ -14621,6 +14990,114 @@ static void ggml_compute_forward_map_binary(
14621
  }
14622
  }
14623
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14624
  // ggml_compute_forward_cross_entropy_loss
14625
 
14626
  static void ggml_compute_forward_cross_entropy_loss_f32(
@@ -15158,6 +15635,24 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15158
  ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
15159
  }
15160
  break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15161
  case GGML_OP_CROSS_ENTROPY_LOSS:
15162
  {
15163
  ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
@@ -15766,17 +16261,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15766
  {
15767
  if (src0->grad) {
15768
  assert(src1->type == GGML_TYPE_I32);
15769
- assert(ggml_nelements(src1) == 3);
15770
  const int n_past = ((int32_t *) src1->data)[0];
15771
  const int n_dims = ((int32_t *) src1->data)[1];
15772
  const int mode = ((int32_t *) src1->data)[2];
 
15773
  src0->grad = ggml_add_impl(ctx,
15774
  src0->grad,
15775
  ggml_rope(ctx,
15776
  tensor->grad,
15777
  n_past,
15778
  n_dims,
15779
- mode),
 
15780
  inplace);
15781
  }
15782
  if (src1->grad) {
@@ -15964,6 +16461,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15964
  case GGML_OP_WIN_UNPART:
15965
  case GGML_OP_MAP_UNARY:
15966
  case GGML_OP_MAP_BINARY:
 
 
 
15967
  {
15968
  GGML_ASSERT(false); // not supported
15969
  } break;
@@ -16198,68 +16698,173 @@ typedef pthread_t ggml_thread_t;
16198
 
16199
  #endif
16200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16201
  struct ggml_compute_state_shared {
16202
- ggml_lock_t spin;
 
 
 
16203
 
16204
  int n_threads;
16205
 
16206
  // synchronization primitives
16207
- atomic_int n_ready;
16208
- atomic_bool has_work;
16209
- atomic_bool stop; // stop all threads
16210
  };
16211
 
16212
  struct ggml_compute_state {
16213
  ggml_thread_t thrd;
16214
-
16215
- struct ggml_compute_params params;
16216
- struct ggml_tensor * node;
16217
-
16218
  struct ggml_compute_state_shared * shared;
16219
  };
16220
 
 
 
 
 
 
 
 
 
 
16221
  static thread_ret_t ggml_graph_compute_thread(void * data) {
16222
  struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
16223
 
16224
  const int n_threads = state->shared->n_threads;
 
 
 
16225
 
16226
  while (true) {
16227
- if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
16228
- atomic_store(&state->shared->has_work, false);
16229
- } else {
16230
- while (atomic_load(&state->shared->has_work)) {
16231
- if (atomic_load(&state->shared->stop)) {
16232
- return 0;
16233
- }
16234
- ggml_lock_lock (&state->shared->spin);
16235
- ggml_lock_unlock(&state->shared->spin);
 
 
 
 
 
 
 
 
16236
  }
16237
- }
16238
 
16239
- atomic_fetch_sub(&state->shared->n_ready, 1);
 
 
 
 
16240
 
16241
- // wait for work
16242
- while (!atomic_load(&state->shared->has_work)) {
16243
- if (atomic_load(&state->shared->stop)) {
16244
- return 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16245
  }
16246
- ggml_lock_lock (&state->shared->spin);
16247
- ggml_lock_unlock(&state->shared->spin);
 
 
 
 
 
 
 
 
16248
  }
16249
 
16250
  // check if we should stop
16251
- if (atomic_load(&state->shared->stop)) {
16252
- break;
16253
- }
16254
 
16255
- if (state->node) {
16256
- if (state->params.ith < state->params.nth) {
16257
- ggml_compute_forward(&state->params, state->node);
16258
- }
16259
 
16260
- state->node = NULL;
16261
- } else {
16262
- break;
 
 
 
 
 
 
 
16263
  }
16264
  }
16265
 
@@ -16270,39 +16875,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16270
  const int n_threads = cgraph->n_threads;
16271
 
16272
  struct ggml_compute_state_shared state_shared = {
16273
- /*.spin =*/ GGML_LOCK_INITIALIZER,
16274
- /*.n_threads =*/ n_threads,
16275
- /*.n_ready =*/ 0,
16276
- /*.has_work =*/ false,
16277
- /*.stop =*/ false,
 
16278
  };
16279
- struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
16280
-
16281
- // create thread pool
16282
- if (n_threads > 1) {
16283
- ggml_lock_init(&state_shared.spin);
16284
-
16285
- atomic_store(&state_shared.has_work, true);
16286
-
16287
- for (int j = 0; j < n_threads - 1; j++) {
16288
- workers[j] = (struct ggml_compute_state) {
16289
- .thrd = 0,
16290
- .params = {
16291
- .type = GGML_TASK_COMPUTE,
16292
- .ith = j + 1,
16293
- .nth = n_threads,
16294
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16295
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16296
- },
16297
- .node = NULL,
16298
- .shared = &state_shared,
16299
- };
16300
-
16301
- int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
16302
- GGML_ASSERT(rc == 0);
16303
- UNUSED(rc);
16304
- }
16305
- }
16306
 
16307
  // initialize tasks + work buffer
16308
  {
@@ -16446,7 +17026,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16446
  } break;
16447
  case GGML_OP_SCALE:
16448
  {
16449
- node->n_tasks = n_threads;
16450
  } break;
16451
  case GGML_OP_SET:
16452
  case GGML_OP_CONT:
@@ -16605,6 +17185,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16605
  case GGML_OP_WIN_UNPART:
16606
  case GGML_OP_MAP_UNARY:
16607
  case GGML_OP_MAP_BINARY:
 
 
 
16608
  {
16609
  node->n_tasks = 1;
16610
  } break;
@@ -16647,166 +17230,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16647
  }
16648
  }
16649
 
16650
- const int64_t perf_start_cycles = ggml_perf_cycles();
16651
- const int64_t perf_start_time_us = ggml_perf_time_us();
16652
-
16653
- for (int i = 0; i < cgraph->n_nodes; i++) {
16654
- GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
16655
-
16656
- struct ggml_tensor * node = cgraph->nodes[i];
16657
-
16658
- // TODO: this could be used to avoid unnecessary computations, but it needs to be improved
16659
- //if (node->grad == NULL && node->perf_runs > 0) {
16660
- // continue;
16661
- //}
16662
-
16663
- const int64_t perf_node_start_cycles = ggml_perf_cycles();
16664
- const int64_t perf_node_start_time_us = ggml_perf_time_us();
16665
-
16666
- // INIT
16667
- struct ggml_compute_params params = {
16668
- /*.type =*/ GGML_TASK_INIT,
16669
- /*.ith =*/ 0,
16670
- /*.nth =*/ node->n_tasks,
16671
- /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16672
- /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16673
- };
16674
-
16675
- ggml_compute_forward(&params, node);
16676
-
16677
- // COMPUTE
16678
- if (node->n_tasks > 1) {
16679
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16680
- atomic_store(&state_shared.has_work, false);
16681
- }
16682
-
16683
- while (atomic_load(&state_shared.has_work)) {
16684
- ggml_lock_lock (&state_shared.spin);
16685
- ggml_lock_unlock(&state_shared.spin);
16686
- }
16687
-
16688
- // launch thread pool
16689
- for (int j = 0; j < n_threads - 1; j++) {
16690
- workers[j].params = (struct ggml_compute_params) {
16691
- .type = GGML_TASK_COMPUTE,
16692
- .ith = j + 1,
16693
- .nth = node->n_tasks,
16694
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16695
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16696
- };
16697
- workers[j].node = node;
16698
- }
16699
-
16700
- atomic_fetch_sub(&state_shared.n_ready, 1);
16701
-
16702
- while (atomic_load(&state_shared.n_ready) > 0) {
16703
- ggml_lock_lock (&state_shared.spin);
16704
- ggml_lock_unlock(&state_shared.spin);
16705
- }
16706
-
16707
- atomic_store(&state_shared.has_work, true);
16708
- }
16709
-
16710
- params.type = GGML_TASK_COMPUTE;
16711
- ggml_compute_forward(&params, node);
16712
-
16713
- // wait for thread pool
16714
- if (node->n_tasks > 1) {
16715
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16716
- atomic_store(&state_shared.has_work, false);
16717
- }
16718
-
16719
- while (atomic_load(&state_shared.has_work)) {
16720
- ggml_lock_lock (&state_shared.spin);
16721
- ggml_lock_unlock(&state_shared.spin);
16722
- }
16723
-
16724
- atomic_fetch_sub(&state_shared.n_ready, 1);
16725
-
16726
- while (atomic_load(&state_shared.n_ready) != 0) {
16727
- ggml_lock_lock (&state_shared.spin);
16728
- ggml_lock_unlock(&state_shared.spin);
16729
- }
16730
- }
16731
-
16732
- // FINALIZE
16733
- if (node->n_tasks > 1) {
16734
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16735
- atomic_store(&state_shared.has_work, false);
16736
- }
16737
-
16738
- while (atomic_load(&state_shared.has_work)) {
16739
- ggml_lock_lock (&state_shared.spin);
16740
- ggml_lock_unlock(&state_shared.spin);
16741
- }
16742
-
16743
- // launch thread pool
16744
- for (int j = 0; j < n_threads - 1; j++) {
16745
- workers[j].params = (struct ggml_compute_params) {
16746
- .type = GGML_TASK_FINALIZE,
16747
- .ith = j + 1,
16748
- .nth = node->n_tasks,
16749
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16750
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16751
- };
16752
- workers[j].node = node;
16753
- }
16754
-
16755
- atomic_fetch_sub(&state_shared.n_ready, 1);
16756
-
16757
- while (atomic_load(&state_shared.n_ready) > 0) {
16758
- ggml_lock_lock (&state_shared.spin);
16759
- ggml_lock_unlock(&state_shared.spin);
16760
- }
16761
 
16762
- atomic_store(&state_shared.has_work, true);
 
16763
  }
 
 
 
16764
 
16765
- params.type = GGML_TASK_FINALIZE;
16766
- ggml_compute_forward(&params, node);
16767
-
16768
- // wait for thread pool
16769
- if (node->n_tasks > 1) {
16770
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16771
- atomic_store(&state_shared.has_work, false);
16772
- }
16773
-
16774
- while (atomic_load(&state_shared.has_work)) {
16775
- ggml_lock_lock (&state_shared.spin);
16776
- ggml_lock_unlock(&state_shared.spin);
16777
- }
16778
-
16779
- atomic_fetch_sub(&state_shared.n_ready, 1);
16780
-
16781
- while (atomic_load(&state_shared.n_ready) != 0) {
16782
- ggml_lock_lock (&state_shared.spin);
16783
- ggml_lock_unlock(&state_shared.spin);
16784
- }
16785
- }
16786
 
16787
- // performance stats (node)
16788
- {
16789
- int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles;
16790
- int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
16791
 
16792
- node->perf_runs++;
16793
- node->perf_cycles += perf_cycles_cur;
16794
- node->perf_time_us += perf_time_us_cur;
16795
- }
16796
- }
16797
 
16798
  // join thread pool
16799
  if (n_threads > 1) {
16800
- atomic_store(&state_shared.stop, true);
16801
- atomic_store(&state_shared.has_work, true);
16802
-
16803
- for (int j = 0; j < n_threads - 1; j++) {
16804
- int rc = ggml_thread_join(workers[j].thrd, NULL);
16805
  GGML_ASSERT(rc == 0);
16806
- UNUSED(rc);
16807
  }
16808
-
16809
- ggml_lock_destroy(&state_shared.spin);
16810
  }
16811
 
16812
  // performance stats (graph)
 
1
+ #define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
2
+ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
3
 
4
  #include "ggml.h"
5
 
 
91
  #include <stdatomic.h>
92
 
93
  typedef void* thread_ret_t;
94
+
95
+ #include <sys/types.h>
96
+ #include <sys/stat.h>
97
+ #include <unistd.h>
98
+
99
  #endif
100
 
101
  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
 
124
  #define GGML_SOFT_MAX_UNROLL 4
125
  #define GGML_VEC_DOT_UNROLL 2
126
 
127
+ //
128
+ // logging
129
+ //
130
+
131
+ #if (GGML_DEBUG >= 1)
132
+ #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
133
+ #else
134
+ #define GGML_PRINT_DEBUG(...)
135
+ #endif
136
+
137
+ #if (GGML_DEBUG >= 5)
138
+ #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
139
+ #else
140
+ #define GGML_PRINT_DEBUG_5(...)
141
+ #endif
142
+
143
+ #if (GGML_DEBUG >= 10)
144
+ #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
145
+ #else
146
+ #define GGML_PRINT_DEBUG_10(...)
147
+ #endif
148
+
149
+ #define GGML_PRINT(...) printf(__VA_ARGS__)
150
+
151
  #ifdef GGML_USE_ACCELERATE
152
  // uncomment to use vDSP for soft max computation
153
  // note: not sure if it is actually faster
 
160
  #define GGML_MEM_ALIGN 16
161
  #endif
162
 
163
+ //
164
+ // logging
165
+ //
166
+
167
+ #if (GGML_DEBUG >= 1)
168
+ #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
169
+ #else
170
+ #define GGML_PRINT_DEBUG(...)
171
+ #endif
172
+
173
+ #if (GGML_DEBUG >= 5)
174
+ #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
175
+ #else
176
+ #define GGML_PRINT_DEBUG_5(...)
177
+ #endif
178
+
179
+ #if (GGML_DEBUG >= 10)
180
+ #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
181
+ #else
182
+ #define GGML_PRINT_DEBUG_10(...)
183
+ #endif
184
+
185
+ #define GGML_PRINT(...) printf(__VA_ARGS__)
186
+
187
+ //
188
+ // end of logging block
189
+ //
190
+
191
  #if defined(_MSC_VER) || defined(__MINGW32__)
192
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
193
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 
201
  #endif
202
  if (result != 0) {
203
  // Handle allocation failure
204
+ const char *error_desc = "unknown allocation error";
205
+ switch (result) {
206
+ case EINVAL:
207
+ error_desc = "invalid alignment value";
208
+ break;
209
+ case ENOMEM:
210
+ error_desc = "insufficient memory";
211
+ break;
212
+ }
213
+ GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
214
+ __func__, error_desc, size/(1024.0*1024.0));
215
  return NULL;
216
  }
217
  return aligned_memory;
 
488
  }
489
  }
490
 
 
491
  //
492
  // timing
493
  //
 
550
  #define ggml_perf_cycles_per_ms() 0
551
  #endif
552
 
553
+
554
  //
555
  // cache line
556
  //
 
3598
  *s = 1.f/(*s);
3599
  }
3600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3601
  //
3602
  // data types
3603
  //
 
3757
  "MAP_UNARY",
3758
  "MAP_BINARY",
3759
 
3760
+ "MAP_CUSTOM1",
3761
+ "MAP_CUSTOM2",
3762
+ "MAP_CUSTOM3",
3763
+
3764
  "CROSS_ENTROPY_LOSS",
3765
  "CROSS_ENTROPY_LOSS_BACK",
3766
  };
3767
 
3768
+ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
3769
 
3770
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3771
  "none",
 
3833
  "f(x)",
3834
  "f(x,y)",
3835
 
3836
+ "custom(x)",
3837
+ "custom(x,y)",
3838
+ "custom(x,y,z)",
3839
+
3840
  "cross_entropy_loss(x,y)",
3841
  "cross_entropy_loss_back(x,y)",
3842
  };
3843
 
3844
+ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
3845
 
3846
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3847
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
3872
  struct ggml_context context;
3873
  };
3874
 
3875
+ //
3876
+ // NUMA support
3877
+ //
3878
+
3879
+ #define GGML_NUMA_MAX_NODES 8
3880
+ #define GGML_NUMA_MAX_CPUS 512
3881
+
3882
+ struct ggml_numa_node {
3883
+ uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
3884
+ uint32_t n_cpus;
3885
+ };
3886
+
3887
+ struct ggml_numa_nodes {
3888
+ struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
3889
+ uint32_t n_nodes;
3890
+ uint32_t total_cpus; // hardware threads on system
3891
+ };
3892
+
3893
  //
3894
  // ggml state
3895
  //
3896
 
3897
  struct ggml_state {
3898
  struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
3899
+ struct ggml_numa_nodes numa;
3900
  };
3901
 
3902
  // global state
 
3921
  atomic_fetch_sub(&g_state_barrier, 1);
3922
  }
3923
 
3924
+ void ggml_numa_init(void) {
3925
+ if (g_state.numa.n_nodes > 0) {
3926
+ fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
3927
+
3928
+ return;
3929
+ }
3930
+
3931
+ #ifdef __linux__
3932
+ struct stat st;
3933
+ char path[256];
3934
+ int rv;
3935
+
3936
+ // enumerate nodes
3937
+ while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
3938
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
3939
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3940
+ if (stat(path, &st) != 0) { break; }
3941
+ ++g_state.numa.n_nodes;
3942
+ }
3943
+
3944
+ // enumerate CPUs
3945
+ while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
3946
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
3947
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3948
+ if (stat(path, &st) != 0) { break; }
3949
+ ++g_state.numa.total_cpus;
3950
+ }
3951
+
3952
+ GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
3953
+
3954
+ if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
3955
+ g_state.numa.n_nodes = 0;
3956
+ return;
3957
+ }
3958
+
3959
+ for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
3960
+ struct ggml_numa_node * node = &g_state.numa.nodes[n];
3961
+ GGML_PRINT_DEBUG("CPUs on node %u:", n);
3962
+ node->n_cpus = 0;
3963
+ for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
3964
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
3965
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3966
+ if (stat(path, &st) == 0) {
3967
+ node->cpus[node->n_cpus++] = c;
3968
+ GGML_PRINT_DEBUG(" %u", c);
3969
+ }
3970
+ }
3971
+ GGML_PRINT_DEBUG("\n");
3972
+ }
3973
+
3974
+ if (ggml_is_numa()) {
3975
+ FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
3976
+ if (fptr != NULL) {
3977
+ char buf[42];
3978
+ if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
3979
+ GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
3980
+ }
3981
+ fclose(fptr);
3982
+ }
3983
+ }
3984
+ #else
3985
+ // TODO
3986
+ #endif
3987
+ }
3988
+
3989
+ bool ggml_is_numa(void) {
3990
+ return g_state.numa.n_nodes > 1;
3991
+ }
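A minimal caller-side sketch of the new NUMA helpers (editor's addition, not in the diff; it assumes ggml.h exports ggml_numa_init() and ggml_is_numa(), matching the definitions above):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    ggml_numa_init();   // probe /sys once at startup, before spawning worker threads
    if (ggml_is_numa()) {
        printf("more than one NUMA node detected; thread pinning may help\n");
    }
    return 0;
}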
3992
+
3993
  ////////////////////////////////////////////////////////////////////////////////
3994
 
3995
  void ggml_print_object(const struct ggml_object * obj) {
 
4246
 
4247
  g_state = (struct ggml_state) {
4248
  /*.contexts =*/ { { 0 } },
4249
+ /*.numa =*/ {
4250
+ .n_nodes = 0,
4251
+ .total_cpus = 0,
4252
+ },
4253
  };
4254
 
4255
  for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
 
6778
  int n_past,
6779
  int n_dims,
6780
  int mode,
6781
+ int n_ctx,
6782
  bool inplace) {
6783
  GGML_ASSERT(n_past >= 0);
6784
  bool is_node = false;
 
6791
 
6792
  ggml_scratch_save(ctx);
6793
 
6794
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
6795
 
6796
  ((int32_t *) b->data)[0] = n_past;
6797
  ((int32_t *) b->data)[1] = n_dims;
6798
  ((int32_t *) b->data)[2] = mode;
6799
+ ((int32_t *) b->data)[3] = n_ctx;
6800
 
6801
  ggml_scratch_load(ctx);
6802
 
 
6813
  struct ggml_tensor * a,
6814
  int n_past,
6815
  int n_dims,
6816
+ int mode,
6817
+ int n_ctx) {
6818
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
6819
  }
6820
 
6821
  struct ggml_tensor * ggml_rope_inplace(
 
6823
  struct ggml_tensor * a,
6824
  int n_past,
6825
  int n_dims,
6826
+ int mode,
6827
+ int n_ctx) {
6828
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
6829
  }
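Callers of ggml_rope()/ggml_rope_inplace() now pass the running context length as an extra argument. A hedged caller-side sketch (editor's addition; ctx0, cur, n_past, n_rot and n_ctx are hypothetical caller variables, not names from this diff):

#include "ggml.h"

// mode 0 = original rotary, mode 2 = NeoX-style, mode 4 = the new GLM-style path
static struct ggml_tensor * apply_rope(struct ggml_context * ctx0,
                                       struct ggml_tensor * cur,
                                       int n_past, int n_rot, int n_ctx) {
    return ggml_rope_inplace(ctx0, cur, n_past, n_rot, /*mode=*/0, n_ctx);
}

Downstream, the forward pass uses n_ctx both for the GLM branch and for linear scaling: when n_ctx exceeds GGML_TRAINING_CTX, theta is multiplied by GGML_TRAINING_CTX/n_ctx (for instance, a training window of 2048 with n_ctx = 4096 would halve every theta; 2048 is only an illustrative value here).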
6830
 
6831
  // ggml_rope_back
 
7242
  is_node = true;
7243
  }
7244
 
7245
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7246
+
7247
+ ggml_scratch_save(ctx);
7248
+
7249
  struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7250
  *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7251
+
7252
+ ggml_scratch_load(ctx);
7253
 
7254
  result->op = GGML_OP_MAP_UNARY;
7255
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
7289
  is_node = true;
7290
  }
7291
 
7292
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7293
+
7294
+ ggml_scratch_save(ctx);
7295
+
7296
  struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7297
  *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7298
+
7299
+ ggml_scratch_load(ctx);
7300
 
7301
  result->op = GGML_OP_MAP_BINARY;
7302
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
7323
  return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
7324
  }
7325
 
7326
+ // ggml_map_custom1
7327
+
7328
+ struct ggml_tensor * ggml_map_custom1_impl_f32(
7329
+ struct ggml_context * ctx,
7330
+ struct ggml_tensor * a,
7331
+ const ggml_custom1_op_f32_t fun,
7332
+ bool inplace) {
7333
+ bool is_node = false;
7334
+
7335
+ if (!inplace && a->grad) {
7336
+ is_node = true;
7337
+ }
7338
+
7339
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7340
+
7341
+ ggml_scratch_save(ctx);
7342
+
7343
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7344
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7345
+
7346
+ ggml_scratch_load(ctx);
7347
+
7348
+ result->op = GGML_OP_MAP_CUSTOM1;
7349
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7350
+ result->src0 = a;
7351
+ result->opt[0] = addr_tensor;
7352
+
7353
+ return result;
7354
+ }
7355
+
7356
+ struct ggml_tensor * ggml_map_custom1_f32(
7357
+ struct ggml_context * ctx,
7358
+ struct ggml_tensor * a,
7359
+ const ggml_custom1_op_f32_t fun) {
7360
+ return ggml_map_custom1_impl_f32(ctx, a, fun, false);
7361
+ }
7362
+
7363
+ struct ggml_tensor * ggml_map_custom1_inplace_f32(
7364
+ struct ggml_context * ctx,
7365
+ struct ggml_tensor * a,
7366
+ const ggml_custom1_op_f32_t fun) {
7367
+ return ggml_map_custom1_impl_f32(ctx, a, fun, true);
7368
+ }
7369
+
7370
+ // ggml_map_custom2
7371
+
7372
+ struct ggml_tensor * ggml_map_custom2_impl_f32(
7373
+ struct ggml_context * ctx,
7374
+ struct ggml_tensor * a,
7375
+ struct ggml_tensor * b,
7376
+ const ggml_custom2_op_f32_t fun,
7377
+ bool inplace) {
7378
+ bool is_node = false;
7379
+
7380
+ if (!inplace && (a->grad || b->grad)) {
7381
+ is_node = true;
7382
+ }
7383
+
7384
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7385
+
7386
+ ggml_scratch_save(ctx);
7387
+
7388
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7389
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7390
+
7391
+ ggml_scratch_load(ctx);
7392
+
7393
+ result->op = GGML_OP_MAP_CUSTOM2;
7394
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7395
+ result->src0 = a;
7396
+ result->src1 = b;
7397
+ result->opt[0] = addr_tensor;
7398
+
7399
+ return result;
7400
+ }
7401
+
7402
+ struct ggml_tensor * ggml_map_custom2_f32(
7403
+ struct ggml_context * ctx,
7404
+ struct ggml_tensor * a,
7405
+ struct ggml_tensor * b,
7406
+ const ggml_custom2_op_f32_t fun) {
7407
+ return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
7408
+ }
7409
+
7410
+ struct ggml_tensor * ggml_map_custom2_inplace_f32(
7411
+ struct ggml_context * ctx,
7412
+ struct ggml_tensor * a,
7413
+ struct ggml_tensor * b,
7414
+ const ggml_custom2_op_f32_t fun) {
7415
+ return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
7416
+ }
7417
+
7418
+ // ggml_map_custom3
7419
+
7420
+ struct ggml_tensor * ggml_map_custom3_impl_f32(
7421
+ struct ggml_context * ctx,
7422
+ struct ggml_tensor * a,
7423
+ struct ggml_tensor * b,
7424
+ struct ggml_tensor * c,
7425
+ const ggml_custom3_op_f32_t fun,
7426
+ bool inplace) {
7427
+ bool is_node = false;
7428
+
7429
+ if (!inplace && (a->grad || b->grad || c->grad)) {
7430
+ is_node = true;
7431
+ }
7432
+
7433
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7434
+
7435
+ ggml_scratch_save(ctx);
7436
+
7437
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7438
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7439
+
7440
+ ggml_scratch_load(ctx);
7441
+
7442
+ result->op = GGML_OP_MAP_CUSTOM3;
7443
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7444
+ result->src0 = a;
7445
+ result->src1 = b;
7446
+ result->opt[0] = addr_tensor;
7447
+ result->opt[1] = c;
7448
+
7449
+ return result;
7450
+ }
7451
+
7452
+ struct ggml_tensor * ggml_map_custom3_f32(
7453
+ struct ggml_context * ctx,
7454
+ struct ggml_tensor * a,
7455
+ struct ggml_tensor * b,
7456
+ struct ggml_tensor * c,
7457
+ const ggml_custom3_op_f32_t fun) {
7458
+ return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
7459
+ }
7460
+
7461
+ struct ggml_tensor * ggml_map_custom3_inplace_f32(
7462
+ struct ggml_context * ctx,
7463
+ struct ggml_tensor * a,
7464
+ struct ggml_tensor * b,
7465
+ struct ggml_tensor * c,
7466
+ const ggml_custom3_op_f32_t fun) {
7467
+ return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
7468
+ }
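A usage sketch for the new custom-map entry points (editor's addition, not part of the commit). The callback shape follows the dispatch further down in this diff, which invokes fun(dst, a); the typedef name ggml_custom1_op_f32_t is assumed to be declared in ggml.h:

#include <math.h>
#include "ggml.h"

// elementwise softplus as a custom op; reads a, writes dst (same shape)
static void my_softplus(struct ggml_tensor * dst, const struct ggml_tensor * a) {
    const int n = (int) ggml_nelements(a);
    const float * src = (const float *) a->data;
    float       * out = (float *) dst->data;
    for (int i = 0; i < n; ++i) {
        out[i] = logf(1.0f + expf(src[i]));
    }
}

// while building a graph (hypothetical names):
//   struct ggml_tensor * y = ggml_map_custom1_f32(ctx, x, my_softplus);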
7469
+
7470
  // ggml_cross_entropy_loss
7471
 
7472
  struct ggml_tensor * ggml_cross_entropy_loss(
 
12444
  const struct ggml_tensor * src1,
12445
  struct ggml_tensor * dst) {
12446
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
12447
+ GGML_ASSERT(ggml_nelements(src1) == 4);
12448
 
12449
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12450
  return;
 
12453
  const int n_past = ((int32_t *) src1->data)[0];
12454
  const int n_dims = ((int32_t *) src1->data)[1];
12455
  const int mode = ((int32_t *) src1->data)[2];
12456
+ const int n_ctx = ((int32_t *) src1->data)[3];
12457
 
12458
  assert(n_past >= 0);
12459
 
 
12498
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
12499
 
12500
  const bool is_neox = mode & 2;
12501
+ const bool is_glm = mode & 4;
12502
 
12503
  for (int64_t i3 = 0; i3 < ne3; i3++) {
12504
  for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
 
12509
 
12510
  float theta = (float)p;
12511
 
12512
+ if (is_glm) {
12513
+ theta = MIN(p, n_ctx - 2);
12514
+ float block_theta = MAX(p - (n_ctx - 2), 0);
12515
+ for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
12516
+ const float cos_theta = cosf(theta);
12517
+ const float sin_theta = sinf(theta);
12518
+ const float cos_block_theta = cosf(block_theta);
12519
+ const float sin_block_theta = sinf(block_theta);
12520
+
12521
+ theta *= theta_scale;
12522
+ block_theta *= theta_scale;
12523
+
12524
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12525
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
12526
+
12527
+ const float x0 = src[0];
12528
+ const float x1 = src[n_dims/2];
12529
+ const float x2 = src[n_dims];
12530
+ const float x3 = src[n_dims/2*3];
12531
+
12532
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
12533
+ dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
12534
+ dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
12535
+ dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
12536
+ }
12537
+ } else if (!is_neox) {
12538
+ if (n_ctx > GGML_TRAINING_CTX) {
12539
+ theta = theta * GGML_TRAINING_CTX / n_ctx;
12540
+ }
12541
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12542
  const float cos_theta = cosf(theta);
12543
  const float sin_theta = sinf(theta);
 
12587
  const struct ggml_tensor * src1,
12588
  struct ggml_tensor * dst) {
12589
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
12590
+ GGML_ASSERT(ggml_nelements(src1) == 4);
12591
 
12592
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12593
  return;
 
12596
  const int n_past = ((int32_t *) src1->data)[0];
12597
  const int n_dims = ((int32_t *) src1->data)[1];
12598
  const int mode = ((int32_t *) src1->data)[2];
12599
+ const int n_ctx = ((int32_t *) src1->data)[3];
12600
 
12601
  assert(n_past >= 0);
12602
 
 
12641
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
12642
 
12643
  const bool is_neox = mode & 2;
12644
+ const bool is_glm = mode & 4;
12645
 
12646
  for (int64_t i3 = 0; i3 < ne3; i3++) {
12647
  for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
 
12652
 
12653
  float theta = (float)p;
12654
 
12655
+ if (is_glm) {
12656
+ theta = MIN(p, n_ctx - 2);
12657
+ float block_theta = MAX(p - (n_ctx - 2), 0);
12658
+ for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
12659
+ const float cos_theta = cosf(theta);
12660
+ const float sin_theta = sinf(theta);
12661
+ const float cos_block_theta = cosf(block_theta);
12662
+ const float sin_block_theta = sinf(block_theta);
12663
+
12664
+ theta *= theta_scale;
12665
+ block_theta *= theta_scale;
12666
+
12667
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12668
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
12669
+
12670
+ const float x0 = GGML_FP16_TO_FP32(src[0]);
12671
+ const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
12672
+ const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
12673
+ const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
12674
+
12675
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
12676
+ dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
12677
+ dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
12678
+ dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
12679
+ }
12680
+ } else if (!is_neox) {
12681
+ if (n_ctx > GGML_TRAINING_CTX) {
12682
+ theta = theta * GGML_TRAINING_CTX / n_ctx;
12683
+ }
12684
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12685
  const float cos_theta = cosf(theta);
12686
  const float sin_theta = sinf(theta);
 
12766
  const int n_past = ((int32_t *) src1->data)[0];
12767
  const int n_dims = ((int32_t *) src1->data)[1];
12768
  const int mode = ((int32_t *) src1->data)[2];
12769
+ const int n_ctx = ((int32_t *) src1->data)[3];
12770
 
12771
  assert(n_past >= 0);
12772
 
 
12820
  float theta = (float)p;
12821
 
12822
  if (!is_neox) {
12823
+ if (n_ctx > GGML_TRAINING_CTX) {
12824
+ theta = theta * GGML_TRAINING_CTX / n_ctx;
12825
+ }
12826
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12827
  const float cos_theta = cosf(theta);
12828
  const float sin_theta = sinf(theta);
 
12883
  const int n_past = ((int32_t *) src1->data)[0];
12884
  const int n_dims = ((int32_t *) src1->data)[1];
12885
  const int mode = ((int32_t *) src1->data)[2];
12886
+ const int n_ctx = ((int32_t *) src1->data)[3];
12887
 
12888
  assert(n_past >= 0);
12889
 
 
12937
  float theta = (float)p;
12938
 
12939
  if (!is_neox) {
12940
+ if (n_ctx > GGML_TRAINING_CTX) {
12941
+ theta = theta * GGML_TRAINING_CTX / n_ctx;
12942
+ }
12943
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12944
  const float cos_theta = cosf(theta);
12945
  const float sin_theta = sinf(theta);
 
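The rope hunks above all apply the same linear position scaling once the runtime context exceeds the training context. Below is a minimal standalone sketch of that arithmetic, assuming the GGML_TRAINING_CTX default of 2176 added in the ggml.h hunk further down; the head dimension and context length are illustrative values, not taken from the diff.

// sketch: linear position scaling used by the RoPE kernels when n_ctx > GGML_TRAINING_CTX
#include <math.h>
#include <stdio.h>

#ifndef GGML_TRAINING_CTX
#define GGML_TRAINING_CTX 2176   // default from the ggml.h hunk below
#endif

int main(void) {
    const int n_dims = 128;      // head dimension (assumed for illustration)
    const int n_ctx  = 4096;     // runtime context larger than the training context
    const float theta_scale = powf(10000.0f, -2.0f/n_dims);

    for (int p = 0; p < 3; ++p) {                 // a few token positions
        float theta = (float)p;
        if (n_ctx > GGML_TRAINING_CTX) {
            // compress the position back into the trained range, as in the kernels above
            theta = theta * GGML_TRAINING_CTX / n_ctx;
        }
        for (int i0 = 0; i0 < 4; i0 += 2) {       // first two rotation pairs
            printf("p=%d i0=%d cos=%f sin=%f\n", p, i0, cosf(theta), sinf(theta));
            theta *= theta_scale;                 // advance to the next frequency
        }
    }
    return 0;
}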
13580
  const int nk1 = ne01;
13581
 
13582
  // size of the convolution row - the kernel size unrolled across all channels
13583
+ const int ew0 = nk0*nk1*ne02;
 
13584
 
13585
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13586
  GGML_ASSERT(nb10 == sizeof(float));
 
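For context, ew0 above is simply the kernel footprint unrolled over all input channels. A tiny numeric sketch with made-up dimensions (nk0, nk1 and ne02 are illustrative, not values from this diff):

// sketch: size of one unrolled convolution row
#include <stdio.h>

int main(void) {
    const int nk0  = 3;   // kernel width  (ne00)
    const int nk1  = 3;   // kernel height (ne01)
    const int ne02 = 64;  // input channels
    const int ew0  = nk0 * nk1 * ne02;
    printf("ew0 = %d values per output position\n", ew0);
    return 0;
}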
14990
  }
14991
  }
14992
 
14993
+ // ggml_compute_forward_map_custom1
14994
+
14995
+ static void ggml_compute_forward_map_custom1_f32(
14996
+ const struct ggml_compute_params * params,
14997
+ const struct ggml_tensor * a,
14998
+ struct ggml_tensor * dst,
14999
+ const ggml_custom1_op_f32_t fun) {
15000
+ assert(params->ith == 0);
15001
+
15002
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15003
+ return;
15004
+ }
15005
+
15006
+ fun(dst, a);
15007
+ }
15008
+
15009
+
15010
+ static void ggml_compute_forward_map_custom1(
15011
+ const struct ggml_compute_params * params,
15012
+ const struct ggml_tensor * a,
15013
+ struct ggml_tensor * dst,
15014
+ const ggml_custom1_op_f32_t fun) {
15015
+ switch (a->type) {
15016
+ case GGML_TYPE_F32:
15017
+ {
15018
+ ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
15019
+ } break;
15020
+ default:
15021
+ {
15022
+ GGML_ASSERT(false);
15023
+ } break;
15024
+ }
15025
+ }
15026
+
15027
+ // ggml_compute_forward_map_custom2
15028
+
15029
+ static void ggml_compute_forward_map_custom2_f32(
15030
+ const struct ggml_compute_params * params,
15031
+ const struct ggml_tensor * a,
15032
+ const struct ggml_tensor * b,
15033
+ struct ggml_tensor * dst,
15034
+ const ggml_custom2_op_f32_t fun) {
15035
+ assert(params->ith == 0);
15036
+
15037
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15038
+ return;
15039
+ }
15040
+
15041
+ fun(dst, a, b);
15042
+ }
15043
+
15044
+
15045
+ static void ggml_compute_forward_map_custom2(
15046
+ const struct ggml_compute_params * params,
15047
+ const struct ggml_tensor * a,
15048
+ const struct ggml_tensor * b,
15049
+ struct ggml_tensor * dst,
15050
+ const ggml_custom2_op_f32_t fun) {
15051
+ switch (a->type) {
15052
+ case GGML_TYPE_F32:
15053
+ {
15054
+ ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
15055
+ } break;
15056
+ default:
15057
+ {
15058
+ GGML_ASSERT(false);
15059
+ } break;
15060
+ }
15061
+ }
15062
+
15063
+ // ggml_compute_forward_map_custom3
15064
+
15065
+ static void ggml_compute_forward_map_custom3_f32(
15066
+ const struct ggml_compute_params * params,
15067
+ const struct ggml_tensor * a,
15068
+ const struct ggml_tensor * b,
15069
+ const struct ggml_tensor * c,
15070
+ struct ggml_tensor * dst,
15071
+ const ggml_custom3_op_f32_t fun) {
15072
+ assert(params->ith == 0);
15073
+
15074
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15075
+ return;
15076
+ }
15077
+
15078
+ fun(dst, a, b, c);
15079
+ }
15080
+
15081
+
15082
+ static void ggml_compute_forward_map_custom3(
15083
+ const struct ggml_compute_params * params,
15084
+ const struct ggml_tensor * a,
15085
+ const struct ggml_tensor * b,
15086
+ const struct ggml_tensor * c,
15087
+ struct ggml_tensor * dst,
15088
+ const ggml_custom3_op_f32_t fun) {
15089
+ switch (a->type) {
15090
+ case GGML_TYPE_F32:
15091
+ {
15092
+ ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
15093
+ } break;
15094
+ default:
15095
+ {
15096
+ GGML_ASSERT(false);
15097
+ } break;
15098
+ }
15099
+ }
15100
+
15101
  // ggml_compute_forward_cross_entropy_loss
15102
 
15103
  static void ggml_compute_forward_cross_entropy_loss_f32(
 
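The map_custom forward paths above hand whole tensors to the user callback in a single task: fun(dst, a), fun(dst, a, b) or fun(dst, a, b, c). Below is a minimal sketch of a ggml_custom1_op_f32_t callback under that contract. my_custom_square is a hypothetical example; it assumes dst matches a in shape and F32 type, which is what ggml_map_custom1_f32 sets up, and it would be attached with ggml_map_custom1_f32(ctx, x, my_custom_square) per the declarations added to ggml.h later in this diff.

// sketch: user callback for the new custom1 mapping op
#include "ggml.h"

static void my_custom_square(struct ggml_tensor * dst, const struct ggml_tensor * a) {
    // the forward pass calls this once, on one thread, with the full tensors
    const int64_t n = ggml_nelements(a);
    const float * src = (const float *) a->data;
    float * out = (float *) dst->data;
    for (int64_t i = 0; i < n; ++i) {
        out[i] = src[i] * src[i];
    }
}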
15635
  ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
15636
  }
15637
  break;
15638
+ case GGML_OP_MAP_CUSTOM1:
15639
+ {
15640
+ const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
15641
+ ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
15642
+ }
15643
+ break;
15644
+ case GGML_OP_MAP_CUSTOM2:
15645
+ {
15646
+ const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
15647
+ ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
15648
+ }
15649
+ break;
15650
+ case GGML_OP_MAP_CUSTOM3:
15651
+ {
15652
+ const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
15653
+ ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
15654
+ }
15655
+ break;
15656
  case GGML_OP_CROSS_ENTROPY_LOSS:
15657
  {
15658
  ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
 
16261
  {
16262
  if (src0->grad) {
16263
  assert(src1->type == GGML_TYPE_I32);
16264
+ assert(ggml_nelements(src1) == 4);
16265
  const int n_past = ((int32_t *) src1->data)[0];
16266
  const int n_dims = ((int32_t *) src1->data)[1];
16267
  const int mode = ((int32_t *) src1->data)[2];
16268
+ const int n_ctx = ((int32_t *) src1->data)[3];
16269
  src0->grad = ggml_add_impl(ctx,
16270
  src0->grad,
16271
  ggml_rope(ctx,
16272
  tensor->grad,
16273
  n_past,
16274
  n_dims,
16275
+ mode,
16276
+ n_ctx),
16277
  inplace);
16278
  }
16279
  if (src1->grad) {
 
16461
  case GGML_OP_WIN_UNPART:
16462
  case GGML_OP_MAP_UNARY:
16463
  case GGML_OP_MAP_BINARY:
16464
+ case GGML_OP_MAP_CUSTOM1:
16465
+ case GGML_OP_MAP_CUSTOM2:
16466
+ case GGML_OP_MAP_CUSTOM3:
16467
  {
16468
  GGML_ASSERT(false); // not supported
16469
  } break;
 
16698
 
16699
  #endif
16700
 
16701
+ // Android's libc implementation "bionic" does not support setting affinity
16702
+ #if defined(__linux__) && !defined(__BIONIC__)
16703
+ void set_numa_thread_affinity(int thread_n, int n_threads) {
16704
+ if (!ggml_is_numa()) {
16705
+ return;
16706
+ }
16707
+
16708
+ // run thread on node_num thread_n / (threads per node)
16709
+ const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
16710
+ struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
16711
+ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16712
+
16713
+ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16714
+ CPU_ZERO_S(setsize, cpus);
16715
+ for (size_t i = 0; i < node->n_cpus; ++i) {
16716
+ CPU_SET_S(node->cpus[i], setsize, cpus);
16717
+ }
16718
+
16719
+ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16720
+ if (rv) {
16721
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16722
+ strerror(rv));
16723
+ }
16724
+
16725
+ CPU_FREE(cpus);
16726
+ }
16727
+
16728
+ void clear_numa_thread_affinity(void) {
16729
+ if (!ggml_is_numa()) {
16730
+ return;
16731
+ }
16732
+
16733
+ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16734
+
16735
+ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16736
+ CPU_ZERO_S(setsize, cpus);
16737
+ for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
16738
+ CPU_SET_S(i, setsize, cpus);
16739
+ }
16740
+
16741
+ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16742
+ if (rv) {
16743
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16744
+ strerror(rv));
16745
+ }
16746
+
16747
+ CPU_FREE(cpus);
16748
+ }
16749
+ #else
16750
+ // TODO: Windows etc.
16751
+ // (the linux implementation may also work on BSD, someone should test)
16752
+ void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16753
+ void clear_numa_thread_affinity(void) {}
16754
+ #endif
16755
+
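set_numa_thread_affinity() above derives the node as thread_n divided by the ceiling of threads-per-node. A standalone sketch of just that mapping; the thread and node counts here are made up, at runtime they come from g_state.numa.

// sketch: worker index -> NUMA node assignment
#include <stdio.h>

int main(void) {
    const int n_threads = 12;
    const int n_nodes   = 2;  // assumed NUMA node count
    const int threads_per_node = (n_threads + n_nodes - 1) / n_nodes; // ceil
    for (int thread_n = 0; thread_n < n_threads; ++thread_n) {
        printf("thread %2d -> node %d\n", thread_n, thread_n / threads_per_node);
    }
    return 0;
}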
16756
  struct ggml_compute_state_shared {
16757
+ struct ggml_cgraph * cgraph;
16758
+
16759
+ int64_t perf_node_start_cycles;
16760
+ int64_t perf_node_start_time_us;
16761
 
16762
  int n_threads;
16763
 
16764
  // synchronization primitives
16765
+ atomic_int n_active; // num active threads
16766
+ atomic_int node_n; // active graph node
 
16767
  };
16768
 
16769
  struct ggml_compute_state {
16770
  ggml_thread_t thrd;
16771
+ int ith;
 
 
 
16772
  struct ggml_compute_state_shared * shared;
16773
  };
16774
 
16775
+ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
16776
+ int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
16777
+ int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
16778
+
16779
+ node->perf_runs++;
16780
+ node->perf_cycles += cycles_cur;
16781
+ node->perf_time_us += time_us_cur;
16782
+ }
16783
+
16784
  static thread_ret_t ggml_graph_compute_thread(void * data) {
16785
  struct ggml_compute_state * state = (struct ggml_compute_state *) data;
16786
+ struct ggml_cgraph * cgraph = state->shared->cgraph;
16787
 
16788
  const int n_threads = state->shared->n_threads;
16789
+ set_numa_thread_affinity(state->ith, n_threads);
16790
+
16791
+ int node_n = -1;
16792
 
16793
  while (true) {
16794
+ if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
16795
+ // all other threads are finished and spinning
16796
+ // do finalize and init here so we don't have synchronize again
16797
+ struct ggml_compute_params params = {
16798
+ /*.type =*/ GGML_TASK_FINALIZE,
16799
+ /*.ith =*/ 0,
16800
+ /*.nth =*/ 0,
16801
+ /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16802
+ /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16803
+ };
16804
+
16805
+ if (node_n != -1) {
16806
+ /* FINALIZE */
16807
+ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
16808
+ params.nth = node->n_tasks;
16809
+ ggml_compute_forward(&params, node);
16810
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16811
  }
 
16812
 
16813
+ // distribute new work or execute it direct if 1T
16814
+ while (++node_n < cgraph->n_nodes) {
16815
+ GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16816
+
16817
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16818
 
16819
+ state->shared->perf_node_start_cycles = ggml_perf_cycles();
16820
+ state->shared->perf_node_start_time_us = ggml_perf_time_us();
16821
+
16822
+ /* INIT */
16823
+ params.type = GGML_TASK_INIT;
16824
+ params.nth = node->n_tasks;
16825
+ ggml_compute_forward(&params, node);
16826
+
16827
+ if (node->n_tasks == 1) {
16828
+ // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
16829
+ // they do something more efficient than spinning (?)
16830
+ params.type = GGML_TASK_COMPUTE;
16831
+ ggml_compute_forward(&params, node);
16832
+
16833
+ params.type = GGML_TASK_FINALIZE;
16834
+ ggml_compute_forward(&params, node);
16835
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16836
+ } else {
16837
+ break;
16838
+ }
16839
  }
16840
+
16841
+ atomic_store(&state->shared->n_active, n_threads);
16842
+ atomic_store(&state->shared->node_n, node_n);
16843
+ } else {
16844
+ // wait for other threads to finish
16845
+ const int last = node_n;
16846
+ do {
16847
+ sched_yield();
16848
+ node_n = atomic_load(&state->shared->node_n);
16849
+ } while (node_n == last);
16850
  }
16851
 
16852
  // check if we should stop
16853
+ if (node_n >= cgraph->n_nodes) break;
 
 
16854
 
16855
+ /* COMPUTE */
16856
+ struct ggml_tensor * node = cgraph->nodes[node_n];
 
 
16857
 
16858
+ struct ggml_compute_params params = {
16859
+ /*.type =*/ GGML_TASK_COMPUTE,
16860
+ /*.ith =*/ state->ith,
16861
+ /*.nth =*/ node->n_tasks,
16862
+ /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16863
+ /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16864
+ };
16865
+
16866
+ if (state->ith < node->n_tasks) {
16867
+ ggml_compute_forward(&params, node);
16868
  }
16869
  }
16870
 
 
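The rewritten ggml_graph_compute_thread() above coordinates workers with two atomics: the last worker to decrement n_active finalizes the current node, initializes the next one, re-arms n_active and only then publishes node_n, while the other workers yield until node_n moves. The toy sketch below reduces the pattern to those two atomics; it is not the real graph executor, and the "compute" step is just a printf.

// sketch: the n_active / node_n hand-off, with N_THREADS workers walking N_NODES "nodes"
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

#define N_THREADS 4
#define N_NODES   8

static atomic_int n_active = N_THREADS; // workers still busy on the current node
static atomic_int node_n   = -1;        // index of the node being computed

static void * worker(void * arg) {
    const int ith = (int)(long) arg;
    int my_node = -1;
    while (1) {
        if (atomic_fetch_sub(&n_active, 1) == 1) {
            // last one to finish: "finalize" my_node, pick the next node, re-arm, publish
            const int next = my_node + 1;
            atomic_store(&n_active, N_THREADS);
            atomic_store(&node_n, next);
            my_node = next;
        } else {
            // wait for the coordinating worker to publish a new node
            const int last = my_node;
            do {
                sched_yield();
                my_node = atomic_load(&node_n);
            } while (my_node == last);
        }
        if (my_node >= N_NODES) break;
        printf("thread %d computes its slice of node %d\n", ith, my_node);
    }
    return NULL;
}

int main(void) {
    pthread_t th[N_THREADS];
    for (long i = 0; i < N_THREADS; ++i) {
        pthread_create(&th[i], NULL, worker, (void *) i);
    }
    for (int i = 0; i < N_THREADS; ++i) {
        pthread_join(th[i], NULL);
    }
    return 0;
}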
16875
  const int n_threads = cgraph->n_threads;
16876
 
16877
  struct ggml_compute_state_shared state_shared = {
16878
+ /*.cgraph =*/ cgraph,
16879
+ /*.perf_node_start_cycles =*/ 0,
16880
+ /*.perf_node_start_time_us =*/ 0,
16881
+ /*.n_threads =*/ n_threads,
16882
+ /*.n_active =*/ n_threads,
16883
+ /*.node_n =*/ -1,
16884
  };
16885
+ struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
16886
 
16887
  // initialize tasks + work buffer
16888
  {
 
17026
  } break;
17027
  case GGML_OP_SCALE:
17028
  {
17029
+ node->n_tasks = 1;
17030
  } break;
17031
  case GGML_OP_SET:
17032
  case GGML_OP_CONT:
 
17185
  case GGML_OP_WIN_UNPART:
17186
  case GGML_OP_MAP_UNARY:
17187
  case GGML_OP_MAP_BINARY:
17188
+ case GGML_OP_MAP_CUSTOM1:
17189
+ case GGML_OP_MAP_CUSTOM2:
17190
+ case GGML_OP_MAP_CUSTOM3:
17191
  {
17192
  node->n_tasks = 1;
17193
  } break;
 
17230
  }
17231
  }
17232
 
17233
+ // create thread pool
17234
+ if (n_threads > 1) {
17235
+ for (int j = 1; j < n_threads; ++j) {
17236
+ workers[j] = (struct ggml_compute_state) {
17237
+ .thrd = 0,
17238
+ .ith = j,
17239
+ .shared = &state_shared,
17240
+ };
17241
 
17242
+ const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
17243
+ GGML_ASSERT(rc == 0);
17244
  }
17245
+ }
17246
+ workers[0].ith = 0;
17247
+ workers[0].shared = &state_shared;
17248
 
17249
+ const int64_t perf_start_cycles = ggml_perf_cycles();
17250
+ const int64_t perf_start_time_us = ggml_perf_time_us();
 
17251
 
17252
+ // this is a work thread too
17253
+ ggml_graph_compute_thread(&workers[0]);
 
 
17254
 
17255
+ // don't leave affinity set on the main thread
17256
+ clear_numa_thread_affinity();
 
 
 
17257
 
17258
  // join thread pool
17259
  if (n_threads > 1) {
17260
+ for (int j = 1; j < n_threads; j++) {
17261
+ const int rc = ggml_thread_join(workers[j].thrd, NULL);
 
 
 
17262
  GGML_ASSERT(rc == 0);
 
17263
  }
 
 
17264
  }
17265
 
17266
  // performance stats (graph)
ggml.h CHANGED
@@ -198,9 +198,15 @@
198
  #define GGML_MAX_PARAMS 256
199
  #define GGML_MAX_CONTEXTS 64
200
  #define GGML_MAX_OPT 4
201
- #define GGML_MAX_NAME 32
202
  #define GGML_DEFAULT_N_THREADS 4
203
 
 
 
 
 
 
 
204
  #define GGML_ASSERT(x) \
205
  do { \
206
  if (!(x)) { \
@@ -345,6 +351,10 @@ extern "C" {
345
  GGML_OP_MAP_UNARY,
346
  GGML_OP_MAP_BINARY,
347
 
 
 
 
 
348
  GGML_OP_CROSS_ENTROPY_LOSS,
349
  GGML_OP_CROSS_ENTROPY_LOSS_BACK,
350
 
@@ -465,6 +475,9 @@ extern "C" {
465
  GGML_API int64_t ggml_cycles(void);
466
  GGML_API int64_t ggml_cycles_per_ms(void);
467
 
 
 
 
468
  GGML_API void ggml_print_object (const struct ggml_object * obj);
469
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);
470
 
@@ -1029,13 +1042,15 @@ extern "C" {
1029
  // rotary position embedding
1030
  // if mode & 1 == 1, skip n_past elements
1031
  // if mode & 2 == 1, GPT-NeoX style
 
1032
  // TODO: avoid creating a new tensor every time
1033
  GGML_API struct ggml_tensor * ggml_rope(
1034
  struct ggml_context * ctx,
1035
  struct ggml_tensor * a,
1036
  int n_past,
1037
  int n_dims,
1038
- int mode);
 
1039
 
1040
  // in-place, returns view(a)
1041
  GGML_API struct ggml_tensor * ggml_rope_inplace(
@@ -1043,7 +1058,8 @@ extern "C" {
1043
  struct ggml_tensor * a,
1044
  int n_past,
1045
  int n_dims,
1046
- int mode);
 
1047
 
1048
  // rotary position embedding backward, i.e compute dx from dy
1049
  // a - dy
@@ -1167,21 +1183,73 @@ extern "C" {
1167
  int h0,
1168
  int w);
1169
 
1170
- // Mapping operations
1171
- typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
 
1172
  typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
1173
 
 
 
 
 
1174
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
1175
  struct ggml_context * ctx,
1176
  struct ggml_tensor * a,
1177
  ggml_unary_op_f32_t fun);
1178
 
 
 
 
 
 
1179
  GGML_API struct ggml_tensor * ggml_map_binary_f32(
1180
  struct ggml_context * ctx,
1181
  struct ggml_tensor * a,
1182
  struct ggml_tensor * b,
1183
  ggml_binary_op_f32_t fun);
1184
 
1185
  // loss function
1186
 
1187
  GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
 
198
  #define GGML_MAX_PARAMS 256
199
  #define GGML_MAX_CONTEXTS 64
200
  #define GGML_MAX_OPT 4
201
+ #define GGML_MAX_NAME 48
202
  #define GGML_DEFAULT_N_THREADS 4
203
 
204
+ // Maximum training context of the model in use
205
+ // For the LLaMA models this is normally 2048, but somehow "stepping out" by 128 gives better results (tested at 7B and 13B)
206
+ #ifndef GGML_TRAINING_CTX
207
+ #define GGML_TRAINING_CTX 2176
208
+ #endif
209
+
210
  #define GGML_ASSERT(x) \
211
  do { \
212
  if (!(x)) { \
 
351
  GGML_OP_MAP_UNARY,
352
  GGML_OP_MAP_BINARY,
353
 
354
+ GGML_OP_MAP_CUSTOM1,
355
+ GGML_OP_MAP_CUSTOM2,
356
+ GGML_OP_MAP_CUSTOM3,
357
+
358
  GGML_OP_CROSS_ENTROPY_LOSS,
359
  GGML_OP_CROSS_ENTROPY_LOSS_BACK,
360
 
 
475
  GGML_API int64_t ggml_cycles(void);
476
  GGML_API int64_t ggml_cycles_per_ms(void);
477
 
478
+ GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
479
+ GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
480
+
481
  GGML_API void ggml_print_object (const struct ggml_object * obj);
482
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);
483
 
 
1042
  // rotary position embedding
1043
  // if mode & 1 == 1, skip n_past elements
1044
  // if mode & 2 == 1, GPT-NeoX style
1045
+ // if mode & 4 == 1, ChatGLM style
1046
  // TODO: avoid creating a new tensor every time
1047
  GGML_API struct ggml_tensor * ggml_rope(
1048
  struct ggml_context * ctx,
1049
  struct ggml_tensor * a,
1050
  int n_past,
1051
  int n_dims,
1052
+ int mode,
1053
+ int n_ctx);
1054
 
1055
  // in-place, returns view(a)
1056
  GGML_API struct ggml_tensor * ggml_rope_inplace(
 
1058
  struct ggml_tensor * a,
1059
  int n_past,
1060
  int n_dims,
1061
+ int mode,
1062
+ int n_ctx);
1063
 
1064
  // rotary position embedding backward, i.e compute dx from dy
1065
  // a - dy
 
1183
  int h0,
1184
  int w);
1185
 
1186
+ // custom operators
1187
+
1188
+ typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
1189
  typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
1190
 
1191
+ typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
1192
+ typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1193
+ typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1194
+
1195
  GGML_API struct ggml_tensor * ggml_map_unary_f32(
1196
  struct ggml_context * ctx,
1197
  struct ggml_tensor * a,
1198
  ggml_unary_op_f32_t fun);
1199
 
1200
+ GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
1201
+ struct ggml_context * ctx,
1202
+ struct ggml_tensor * a,
1203
+ ggml_unary_op_f32_t fun);
1204
+
1205
  GGML_API struct ggml_tensor * ggml_map_binary_f32(
1206
  struct ggml_context * ctx,
1207
  struct ggml_tensor * a,
1208
  struct ggml_tensor * b,
1209
  ggml_binary_op_f32_t fun);
1210
 
1211
+ GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
1212
+ struct ggml_context * ctx,
1213
+ struct ggml_tensor * a,
1214
+ struct ggml_tensor * b,
1215
+ ggml_binary_op_f32_t fun);
1216
+
1217
+ GGML_API struct ggml_tensor * ggml_map_custom1_f32(
1218
+ struct ggml_context * ctx,
1219
+ struct ggml_tensor * a,
1220
+ ggml_custom1_op_f32_t fun);
1221
+
1222
+ GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
1223
+ struct ggml_context * ctx,
1224
+ struct ggml_tensor * a,
1225
+ ggml_custom1_op_f32_t fun);
1226
+
1227
+ GGML_API struct ggml_tensor * ggml_map_custom2_f32(
1228
+ struct ggml_context * ctx,
1229
+ struct ggml_tensor * a,
1230
+ struct ggml_tensor * b,
1231
+ ggml_custom2_op_f32_t fun);
1232
+
1233
+ GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
1234
+ struct ggml_context * ctx,
1235
+ struct ggml_tensor * a,
1236
+ struct ggml_tensor * b,
1237
+ ggml_custom2_op_f32_t fun);
1238
+
1239
+ GGML_API struct ggml_tensor * ggml_map_custom3_f32(
1240
+ struct ggml_context * ctx,
1241
+ struct ggml_tensor * a,
1242
+ struct ggml_tensor * b,
1243
+ struct ggml_tensor * c,
1244
+ ggml_custom3_op_f32_t fun);
1245
+
1246
+ GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
1247
+ struct ggml_context * ctx,
1248
+ struct ggml_tensor * a,
1249
+ struct ggml_tensor * b,
1250
+ struct ggml_tensor * c,
1251
+ ggml_custom3_op_f32_t fun);
1252
+
1253
  // loss function
1254
 
1255
  GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
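For reference, a minimal sketch of calling the extended rope API declared above. ctx and cur stand for a context and tensor created elsewhere, and apply_rope is a hypothetical helper; the mode bits follow the header comment (bit 0 skips n_past, bit 1 selects GPT-NeoX style, and the new bit 2 selects ChatGLM style).

// sketch: rotary embedding with the new n_ctx argument
#include "ggml.h"

struct ggml_tensor * apply_rope(struct ggml_context * ctx,
                                struct ggml_tensor * cur,
                                int n_past, int n_rot, int n_ctx) {
    const int mode = 4; // ChatGLM-style rotary embedding (mode & 4)
    return ggml_rope_inplace(ctx, cur, n_past, n_rot, mode, n_ctx);
}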
gpttype_adapter.cpp CHANGED
@@ -377,6 +377,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
377
  //llama_ctx_paran_parts = -1;
378
  llama_ctx_params.seed = -1;
379
  llama_ctx_params.f16_kv = inputs.f16_kv;
 
380
  llama_ctx_params.logits_all = false;
381
  llama_ctx_params.use_mmap = inputs.use_mmap;
382
  llama_ctx_params.use_mlock = inputs.use_mlock;
 
377
  //llama_ctx_paran_parts = -1;
378
  llama_ctx_params.seed = -1;
379
  llama_ctx_params.f16_kv = inputs.f16_kv;
380
+ llama_ctx_params.low_vram = inputs.low_vram;
381
  llama_ctx_params.logits_all = false;
382
  llama_ctx_params.use_mmap = inputs.use_mmap;
383
  llama_ctx_params.use_mlock = inputs.use_mlock;
k_quants.c CHANGED
@@ -261,6 +261,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
261
  return scale;
262
  }
263
 
 
264
  static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
265
  if (j < 4) {
266
  *d = q[j] & 63; *m = q[j + 4] & 63;
@@ -269,6 +270,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
269
  *m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
270
  }
271
  }
 
272
 
273
  //========================- 2-bit (de)-quantization
274
 
@@ -330,11 +332,17 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
330
  }
331
  }
332
 
 
333
  for (int j = 0; j < QK_K; j += 128) {
334
  for (int l = 0; l < 32; ++l) {
335
  y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
336
  }
337
  }
 
 
 
 
 
338
 
339
  x += QK_K;
340
 
@@ -352,6 +360,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int
352
 
353
  const uint8_t * q = x[i].qs;
354
 
 
355
  int is = 0;
356
  float dl, ml;
357
  for (int n = 0; n < QK_K; n += 128) {
@@ -370,7 +379,19 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int
370
  }
371
  q += 32;
372
  }
373
-
 
 
 
 
 
 
 
 
 
 
 
 
374
  }
375
  }
376
 
@@ -412,6 +433,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
412
  }
413
  }
414
 
 
415
  memset(y[i].scales, 0, 12);
416
  if (max_scale) {
417
  float iscale = -32.f/max_scale;
@@ -445,9 +467,39 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
445
  L[16*j + ii] = l + 4;
446
  }
447
  }
448
 
449
  memset(y[i].hmask, 0, QK_K/8);
450
- // We put the high-bit for the 1st 32 quants into bit 0, the next 32 into bit 1, etc.
451
  int m = 0;
452
  uint8_t hm = 1;
453
  for (int j = 0; j < QK_K; ++j) {
@@ -459,19 +511,25 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
459
  m = 0; hm <<= 1;
460
  }
461
  }
 
462
  for (int j = 0; j < QK_K; j += 128) {
463
  for (int l = 0; l < 32; ++l) {
464
  y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
465
  }
466
  }
 
 
 
 
 
467
 
468
  x += QK_K;
469
  }
470
  }
471
 
 
472
  void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
473
  assert(k % QK_K == 0);
474
- assert(QK_K == 256);
475
  const int nb = k / QK_K;
476
 
477
  const uint32_t kmask1 = 0x03030303;
@@ -519,6 +577,39 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int
519
 
520
  }
521
  }
 
522
 
523
  void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
524
  quantize_row_q3_K_reference(x, vy, k);
@@ -563,6 +654,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
563
  }
564
  }
565
 
 
566
  float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
567
  float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
568
  for (int j = 0; j < QK_K/32; ++j) {
@@ -594,9 +686,43 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
594
  L[32*j + ii] = l;
595
  }
596
  }
 
597
  uint8_t * q = y[i].qs;
598
  for (int j = 0; j < QK_K; j += 64) {
599
- for (int l = 0; l < 32; ++l) *q++ = L[j + l] | (L[j + l + 32] << 4);
 
600
  }
601
 
602
  x += QK_K;
@@ -610,11 +736,13 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
610
 
611
  for (int i = 0; i < nb; i++) {
612
 
613
- const float d = ggml_fp16_to_fp32(x[i].d);
614
- const float min = ggml_fp16_to_fp32(x[i].dmin);
615
-
616
  const uint8_t * q = x[i].qs;
617
 
 
 
 
 
 
618
  int is = 0;
619
  uint8_t sc, m;
620
  for (int j = 0; j < QK_K; j += 64) {
@@ -626,6 +754,17 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int
626
  for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2;
627
  q += 32; is += 2;
628
  }
 
 
 
 
 
 
 
 
 
 
 
629
 
630
  }
631
  }
@@ -653,12 +792,19 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
653
  assert(k % QK_K == 0);
654
  const int nb = k / QK_K;
655
 
 
656
  uint8_t L[QK_K];
657
  float mins[QK_K/32];
658
  float scales[QK_K/32];
 
 
 
 
659
 
660
  for (int i = 0; i < nb; i++) {
661
 
 
 
662
  float max_scale = 0; // as we are deducting the min, scales are always positive
663
  float max_min = 0;
664
  for (int j = 0; j < QK_K/32; ++j) {
@@ -725,6 +871,52 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
725
  m1 <<= 2; m2 <<= 2;
726
  ql += 32;
727
  }
728
 
729
  x += QK_K;
730
 
@@ -737,12 +929,14 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
737
 
738
  for (int i = 0; i < nb; i++) {
739
 
740
- const float d = ggml_fp16_to_fp32(x[i].d);
741
- const float min = ggml_fp16_to_fp32(x[i].dmin);
742
-
743
  const uint8_t * ql = x[i].qs;
744
  const uint8_t * qh = x[i].qh;
745
 
 
 
 
 
 
746
  int is = 0;
747
  uint8_t sc, m;
748
  uint8_t u1 = 1, u2 = 2;
@@ -756,6 +950,21 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
756
  ql += 32; is += 2;
757
  u1 <<= 2; u2 <<= 2;
758
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
759
  }
760
  }
761
 
@@ -823,6 +1032,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
823
 
824
  uint8_t * restrict ql = y[i].ql;
825
  uint8_t * restrict qh = y[i].qh;
 
826
  for (int j = 0; j < QK_K; j += 128) {
827
  for (int l = 0; l < 32; ++l) {
828
  const uint8_t q1 = L[j + l + 0] & 0xF;
@@ -836,6 +1046,16 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
836
  ql += 64;
837
  qh += 32;
838
  }
 
 
 
 
 
 
 
 
 
 
839
 
840
  x += QK_K;
841
 
@@ -854,6 +1074,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
854
  const uint8_t * restrict qh = x[i].qh;
855
  const int8_t * restrict sc = x[i].scales;
856
 
 
857
  for (int n = 0; n < QK_K; n += 128) {
858
  for (int l = 0; l < 32; ++l) {
859
  int is = l/16;
@@ -871,6 +1092,19 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
871
  qh += 32;
872
  sc += 8;
873
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
874
 
875
  }
876
  }
@@ -1002,6 +1236,7 @@ static inline __m128i get_scale_shuffle(int i) {
1002
  }
1003
  #endif
1004
 
 
1005
  void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
1006
 
1007
  const block_q2_K * restrict x = vx;
@@ -1158,6 +1393,112 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
1158
 
1159
  *s = hsum_float_8(acc);
1160
 
1161
  #else
1162
 
1163
  float sumf = 0;
@@ -1201,6 +1542,168 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
1201
  #endif
1202
  }
1203
 
1204
  void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
1205
  assert(n % QK_K == 0);
1206
 
@@ -1434,34 +1937,176 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
1434
 
1435
  *s = hsum_float_8(acc);
1436
 
1437
- #else
1438
- // scalar version
1439
- // This function is written like this so the compiler can manage to vectorize most of it
1440
- // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
1441
- // manually vectorized version above. Every other version I tried would run at least 4 times slower.
1442
- // The ideal situation would be if we could just write the code once, and the compiler would
1443
- // automatically produce the best possible set of machine instructions, instead of us having to manually
1444
- // write vectorized versions for AVX, ARM_NEON, etc.
1445
 
1446
- int8_t aux8[QK_K];
1447
- int16_t aux16[8];
1448
- float sums [8];
1449
- int32_t aux32[8];
1450
- memset(sums, 0, 8*sizeof(float));
1451
 
1452
- uint32_t auxs[4];
1453
- const int8_t * scales = (const int8_t*)auxs;
 
1454
 
1455
- float sumf = 0;
1456
  for (int i = 0; i < nb; ++i) {
 
 
 
1457
  const uint8_t * restrict q3 = x[i].qs;
1458
- const uint8_t * restrict hm = x[i].hmask;
1459
- const int8_t * restrict q8 = y[i].qs;
1460
- memset(aux32, 0, 8*sizeof(int32_t));
1461
- int8_t * restrict a = aux8;
1462
- uint8_t m = 1;
1463
- for (int j = 0; j < QK_K; j += 128) {
1464
- for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
1465
  for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1466
  a += 32; m <<= 1;
1467
  for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
@@ -1501,6 +2146,206 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
1501
 
1502
  }
1503
 
1504
  void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
1505
  assert(n % QK_K == 0);
1506
 
@@ -1614,9 +2459,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
1614
  const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
1615
  const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
1616
 
1617
- const uint8_t * restrict q4 = x[i].qs;
1618
- const int8_t * restrict q8 = y[i].qs;
1619
-
1620
  memcpy(utmp, x[i].scales, 12);
1621
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1622
  const uint32_t uaux = utmp[1] & kmask1;
@@ -1624,6 +2466,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
1624
  utmp[2] = uaux;
1625
  utmp[0] &= kmask1;
1626
 
 
 
 
1627
  const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
1628
 
1629
  const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
@@ -1667,6 +2512,88 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
1667
 
1668
  *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
1669
 
1670
  #else
1671
 
1672
 
@@ -1726,7 +2653,176 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
1726
  *s = sumf;
1727
  #endif
1728
  }
1729
 
1730
  void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
1731
  assert(n % QK_K == 0);
1732
 
@@ -1840,18 +2936,23 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
1840
 
1841
  for (int i = 0; i < nb; ++i) {
1842
 
1843
- const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
1844
- const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
1845
-
1846
  const uint8_t * restrict q5 = x[i].qs;
1847
  const int8_t * restrict q8 = y[i].qs;
1848
 
 
 
 
 
1849
  memcpy(utmp, x[i].scales, 12);
1850
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1851
  const uint32_t uaux = utmp[1] & kmask1;
1852
  utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1853
  utmp[2] = uaux;
1854
  utmp[0] &= kmask1;
 
 
 
 
1855
 
1856
  const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
1857
 
@@ -1876,33 +2977,133 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
1876
  const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
1877
  const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
1878
 
1879
- const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
1880
 
1881
- const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
1882
- const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
1883
- const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0);
1884
- hmask = _mm256_slli_epi16(hmask, 1);
 
 
1885
 
1886
- const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
1887
- const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
1888
- const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1);
1889
- hmask = _mm256_slli_epi16(hmask, 1);
1890
 
1891
- const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1892
- const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
1893
 
1894
- __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
1895
- __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
1896
 
1897
- p16_0 = _mm256_madd_epi16(scale_0, p16_0);
1898
- p16_1 = _mm256_madd_epi16(scale_1, p16_1);
1899
 
1900
- sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
1901
 
1902
  }
1903
 
1904
  __m256 vd = _mm256_set1_ps(d);
1905
- acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
 
1906
 
1907
  }
1908
 
@@ -1972,8 +3173,169 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
1972
  #endif
1973
  }
1974
 
1975
 
1976
 
 
1977
  void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
1978
  assert(n % QK_K == 0);
1979
 
@@ -2198,6 +3560,124 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
2198
 
2199
  *s = hsum_float_8(acc);
2200
 
2201
  #else
2202
 
2203
  int8_t aux8[QK_K];
@@ -2242,3 +3722,179 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
2242
  *s = sumf;
2243
  #endif
2244
  }
 
261
  return scale;
262
  }
263
 
264
+ #if QK_K == 256
265
  static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
266
  if (j < 4) {
267
  *d = q[j] & 63; *m = q[j + 4] & 63;
 
270
  *m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
271
  }
272
  }
273
+ #endif
274
 
275
  //========================- 2-bit (de)-quantization
276
 
 
332
  }
333
  }
334
 
335
+ #if QK_K == 256
336
  for (int j = 0; j < QK_K; j += 128) {
337
  for (int l = 0; l < 32; ++l) {
338
  y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
339
  }
340
  }
341
+ #else
342
+ for (int l = 0; l < 16; ++l) {
343
+ y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
344
+ }
345
+ #endif
346
 
347
  x += QK_K;
348
 
 
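The QK_K == 64 branch above packs four 2-bit quants, taken 16 elements apart, into each byte of qs. A standalone round-trip sketch of that layout with toy values:

// sketch: pack/unpack of the QK_K == 64 2-bit layout
#include <assert.h>
#include <stdint.h>

int main(void) {
    uint8_t L[64];
    for (int i = 0; i < 64; ++i) L[i] = (uint8_t)(i % 4); // toy 2-bit values

    uint8_t qs[16];
    for (int l = 0; l < 16; ++l) {
        qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
    }

    for (int l = 0; l < 16; ++l) { // unpack and check the round trip
        assert(((qs[l] >> 0) & 3) == L[l]);
        assert(((qs[l] >> 2) & 3) == L[l + 16]);
        assert(((qs[l] >> 4) & 3) == L[l + 32]);
        assert(((qs[l] >> 6) & 3) == L[l + 48]);
    }
    return 0;
}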
360
 
361
  const uint8_t * q = x[i].qs;
362
 
363
+ #if QK_K == 256
364
  int is = 0;
365
  float dl, ml;
366
  for (int n = 0; n < QK_K; n += 128) {
 
379
  }
380
  q += 32;
381
  }
382
+ #else
383
+ float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4);
384
+ float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4);
385
+ float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4);
386
+ float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4);
387
+ for (int l = 0; l < 16; ++l) {
388
+ y[l+ 0] = dl1 * ((int8_t)((q[l] >> 0) & 3)) - ml1;
389
+ y[l+16] = dl2 * ((int8_t)((q[l] >> 2) & 3)) - ml2;
390
+ y[l+32] = dl3 * ((int8_t)((q[l] >> 4) & 3)) - ml3;
391
+ y[l+48] = dl4 * ((int8_t)((q[l] >> 6) & 3)) - ml4;
392
+ }
393
+ y += QK_K;
394
+ #endif
395
  }
396
  }
397
 
 
433
  }
434
  }
435
 
436
+ #if QK_K == 256
437
  memset(y[i].scales, 0, 12);
438
  if (max_scale) {
439
  float iscale = -32.f/max_scale;
 
467
  L[16*j + ii] = l + 4;
468
  }
469
  }
470
+ #else
471
+ if (max_scale) {
472
+ float iscale = -8.f/max_scale;
473
+ for (int j = 0; j < QK_K/16; j+=2) {
474
+ int l1 = nearest_int(iscale*scales[j]);
475
+ l1 = 8 + MAX(-8, MIN(7, l1));
476
+ int l2 = nearest_int(iscale*scales[j+1]);
477
+ l2 = 8 + MAX(-8, MIN(7, l2));
478
+ y[i].scales[j/2] = l1 | (l2 << 4);
479
+ }
480
+ y[i].d = ggml_fp32_to_fp16(1/iscale);
481
+ } else {
482
+ for (int j = 0; j < QK_K/16; j+=2) {
483
+ y[i].scales[j/2] = 0;
484
+ }
485
+ y[i].d = ggml_fp32_to_fp16(0.f);
486
+ }
487
+ for (int j = 0; j < QK_K/16; ++j) {
488
+ int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4;
489
+ float d = ggml_fp16_to_fp32(y[i].d) * (s - 8);
490
+ if (!d) {
491
+ continue;
492
+ }
493
+ for (int ii = 0; ii < 16; ++ii) {
494
+ int l = nearest_int(x[16*j + ii]/d);
495
+ l = MAX(-4, MIN(3, l));
496
+ L[16*j + ii] = l + 4;
497
+ }
498
+ }
499
+ #endif
500
 
501
  memset(y[i].hmask, 0, QK_K/8);
502
+ // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
503
  int m = 0;
504
  uint8_t hm = 1;
505
  for (int j = 0; j < QK_K; ++j) {
 
511
  m = 0; hm <<= 1;
512
  }
513
  }
514
+ #if QK_K == 256
515
  for (int j = 0; j < QK_K; j += 128) {
516
  for (int l = 0; l < 32; ++l) {
517
  y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
518
  }
519
  }
520
+ #else
521
+ for (int l = 0; l < 16; ++l) {
522
+ y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
523
+ }
524
+ #endif
525
 
526
  x += QK_K;
527
  }
528
  }
529
 
530
+ #if QK_K == 256
531
  void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
532
  assert(k % QK_K == 0);
 
533
  const int nb = k / QK_K;
534
 
535
  const uint32_t kmask1 = 0x03030303;
 
577
 
578
  }
579
  }
580
+ #else
581
+ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
582
+ assert(k % QK_K == 0);
583
+ assert(QK_K == 64);
584
+ const int nb = k / QK_K;
585
+
586
+ for (int i = 0; i < nb; i++) {
587
+
588
+ const float d_all = ggml_fp16_to_fp32(x[i].d);
589
+
590
+ const uint8_t * restrict q = x[i].qs;
591
+ const uint8_t * restrict hm = x[i].hmask;
592
+
593
+ const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
594
+ const float d2 = d_all * ((x[i].scales[0] >> 4) - 8);
595
+ const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
596
+ const float d4 = d_all * ((x[i].scales[1] >> 4) - 8);
597
+
598
+ for (int l=0; l<8; ++l) {
599
+ uint8_t h = hm[l];
600
+ y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4));
601
+ y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4));
602
+ y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4));
603
+ y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4));
604
+ y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4));
605
+ y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4));
606
+ y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4));
607
+ y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4));
608
+ }
609
+ y += QK_K;
610
+ }
611
+ }
612
+ #endif
613
 
614
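In the QK_K == 64 q3_K path above, each quant is rebuilt from two low bits in qs plus one bit in hmask, where a clear hmask bit subtracts 4. A small sketch checking that this round-trips the signed range [-4, 3]:

// sketch: q3_K bit split and reconstruction
#include <assert.h>
#include <stdint.h>

int main(void) {
    for (int v = -4; v <= 3; ++v) {
        const int     stored = v + 4;                   // quantizer stores L = v + 4, in [0, 7]
        const uint8_t low2   = (uint8_t)(stored & 3);   // kept in qs
        const int     hbit   = (stored >> 2) & 1;       // kept in hmask
        const int decoded = (int)low2 - (hbit ? 0 : 4); // dequant rule from the code above
        assert(decoded == v);
    }
    return 0;
}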
  void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
615
  quantize_row_q3_K_reference(x, vy, k);
 
654
  }
655
  }
656
 
657
+ #if QK_K == 256
658
  float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
659
  float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
660
  for (int j = 0; j < QK_K/32; ++j) {
 
686
  L[32*j + ii] = l;
687
  }
688
  }
689
+ #else
690
+ const float s_factor = 15.f;
691
+ float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f;
692
+ float inv_min = max_min > 0 ? s_factor/max_min : 0.f;
693
+ int d1 = nearest_int(inv_scale*scales[0]);
694
+ int m1 = nearest_int(inv_min*mins[0]);
695
+ int d2 = nearest_int(inv_scale*scales[1]);
696
+ int m2 = nearest_int(inv_min*mins[1]);
697
+ y[i].scales[0] = d1 | (m1 << 4);
698
+ y[i].scales[1] = d2 | (m2 << 4);
699
+ y[i].d[0] = ggml_fp32_to_fp16(max_scale/s_factor);
700
+ y[i].d[1] = ggml_fp32_to_fp16(max_min/s_factor);
701
+
702
+ float sumlx = 0;
703
+ int suml2 = 0;
704
+ for (int j = 0; j < QK_K/32; ++j) {
705
+ const uint8_t sd = y[i].scales[j] & 0xF;
706
+ const uint8_t sm = y[i].scales[j] >> 4;
707
+ const float d = ggml_fp16_to_fp32(y[i].d[0]) * sd;
708
+ if (!d) continue;
709
+ const float m = ggml_fp16_to_fp32(y[i].d[1]) * sm;
710
+ for (int ii = 0; ii < 32; ++ii) {
711
+ int l = nearest_int((x[32*j + ii] + m)/d);
712
+ l = MAX(0, MIN(15, l));
713
+ L[32*j + ii] = l;
714
+ sumlx += (x[32*j + ii] + m)*l*sd;
715
+ suml2 += l*l*sd*sd;
716
+ }
717
+ }
718
+ if (suml2) {
719
+ y[i].d[0] = ggml_fp32_to_fp16(sumlx/suml2);
720
+ }
721
+ #endif
722
  uint8_t * q = y[i].qs;
723
  for (int j = 0; j < QK_K; j += 64) {
724
+ for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4);
725
+ q += 32;
726
  }
727
 
728
  x += QK_K;
 
736
 
737
  for (int i = 0; i < nb; i++) {
738
 
 
 
 
739
  const uint8_t * q = x[i].qs;
740
 
741
+ #if QK_K == 256
742
+
743
+ const float d = ggml_fp16_to_fp32(x[i].d);
744
+ const float min = ggml_fp16_to_fp32(x[i].dmin);
745
+
746
  int is = 0;
747
  uint8_t sc, m;
748
  for (int j = 0; j < QK_K; j += 64) {
 
754
  for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2;
755
  q += 32; is += 2;
756
  }
757
+ #else
758
+ const float dall = ggml_fp16_to_fp32(x[i].d[0]);
759
+ const float mall = ggml_fp16_to_fp32(x[i].d[1]);
760
+ const float d1 = dall * (x[i].scales[0] & 0xF), m1 = mall * (x[i].scales[0] >> 4);
761
+ const float d2 = dall * (x[i].scales[1] & 0xF), m2 = mall * (x[i].scales[1] >> 4);
762
+ for (int l = 0; l < 32; ++l) {
763
+ y[l+ 0] = d1 * (q[l] & 0xF) - m1;
764
+ y[l+32] = d2 * (q[l] >> 4) - m2;
765
+ }
766
+ y += QK_K;
767
+ #endif
768
 
769
  }
770
  }
 
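The QK_K == 64 q4_K branch above keeps two fp16 "super" factors in d[0]/d[1] plus one packed byte of 4-bit sub-scale/min per 32 quants. A sketch of that decode with made-up numbers; plain floats stand in for ggml_fp16_t here.

// sketch: decoding the packed per-block scale/min pairs
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const float d_super = 0.05f;   // would be ggml_fp16_to_fp32(x[i].d[0])
    const float m_super = 0.01f;   // would be ggml_fp16_to_fp32(x[i].d[1])
    const uint8_t scales[2] = { 0x3A, 0x27 }; // low nibble: scale, high nibble: min

    for (int j = 0; j < 2; ++j) {
        const float d = d_super * (scales[j] & 0xF);
        const float m = m_super * (scales[j] >> 4);
        // each (d, m) pair then dequantizes 32 4-bit quants as y = d*q - m
        printf("sub-block %d: d=%f m=%f\n", j, d, m);
    }
    return 0;
}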
792
  assert(k % QK_K == 0);
793
  const int nb = k / QK_K;
794
 
795
+ #if QK_K == 256
796
  uint8_t L[QK_K];
797
  float mins[QK_K/32];
798
  float scales[QK_K/32];
799
+ #else
800
+ int8_t L[QK_K];
801
+ float scales[QK_K/16];
802
+ #endif
803
 
804
  for (int i = 0; i < nb; i++) {
805
 
806
+ #if QK_K == 256
807
+
808
  float max_scale = 0; // as we are deducting the min, scales are always positive
809
  float max_min = 0;
810
  for (int j = 0; j < QK_K/32; ++j) {
 
871
  m1 <<= 2; m2 <<= 2;
872
  ql += 32;
873
  }
874
+ #else
875
+ float max_scale = 0, amax = 0;
876
+ for (int j = 0; j < QK_K/16; ++j) {
877
+ scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1);
878
+ float abs_scale = fabsf(scales[j]);
879
+ if (abs_scale > amax) {
880
+ amax = abs_scale;
881
+ max_scale = scales[j];
882
+ }
883
+ }
884
+
885
+ float iscale = -128.f/max_scale;
886
+ for (int j = 0; j < QK_K/16; ++j) {
887
+ int l = nearest_int(iscale*scales[j]);
888
+ y[i].scales[j] = MAX(-128, MIN(127, l));
889
+ }
890
+ y[i].d = ggml_fp32_to_fp16(1/iscale);
891
+
892
+ for (int j = 0; j < QK_K/16; ++j) {
893
+ const float d = ggml_fp16_to_fp32(y[i].d) * y[i].scales[j];
894
+ if (!d) continue;
895
+ for (int ii = 0; ii < 16; ++ii) {
896
+ int l = nearest_int(x[16*j + ii]/d);
897
+ l = MAX(-16, MIN(15, l));
898
+ L[16*j + ii] = l + 16;
899
+ }
900
+ }
901
+
902
+ uint8_t * restrict qh = y[i].qh;
903
+ uint8_t * restrict ql = y[i].qs;
904
+ memset(qh, 0, QK_K/8);
905
+
906
+ for (int j = 0; j < 32; ++j) {
907
+ int jm = j%8;
908
+ int is = j/8;
909
+ int l1 = L[j];
910
+ if (l1 > 15) {
911
+ l1 -= 16; qh[jm] |= (1 << is);
912
+ }
913
+ int l2 = L[j + 32];
914
+ if (l2 > 15) {
915
+ l2 -= 16; qh[jm] |= (1 << (4 + is));
916
+ }
917
+ ql[j] = l1 | (l2 << 4);
918
+ }
919
+ #endif
920
 
921
  x += QK_K;
922
 
 
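The l1 > 15 test in the QK_K == 64 q5_K branch above is just splitting a 5-bit value into a low nibble (stored in ql) and one high bit (stored in qh). A small round-trip sketch of that split:

// sketch: 5-bit quant split into ql nibble + qh bit
#include <assert.h>
#include <stdint.h>

int main(void) {
    for (int v = 0; v < 32; ++v) {
        const uint8_t ql = (uint8_t)(v & 0xF);  // low nibble
        const int     hi = (v > 15) ? 1 : 0;    // high bit parked in qh
        assert((ql | (hi << 4)) == v);          // reconstruction on the dequant side
    }
    return 0;
}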
929
 
930
  for (int i = 0; i < nb; i++) {
931
 
 
 
 
932
  const uint8_t * ql = x[i].qs;
933
  const uint8_t * qh = x[i].qh;
934
 
935
+ #if QK_K == 256
936
+
937
+ const float d = ggml_fp16_to_fp32(x[i].d);
938
+ const float min = ggml_fp16_to_fp32(x[i].dmin);
939
+
940
  int is = 0;
941
  uint8_t sc, m;
942
  uint8_t u1 = 1, u2 = 2;
 
950
  ql += 32; is += 2;
951
  u1 <<= 2; u2 <<= 2;
952
  }
953
+ #else
954
+ float d = ggml_fp16_to_fp32(x[i].d);
955
+ const int8_t * restrict s = x[i].scales;
956
+ for (int l = 0; l < 8; ++l) {
957
+ y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16));
958
+ y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16));
959
+ y[l+16] = d * s[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16));
960
+ y[l+24] = d * s[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16));
961
+ y[l+32] = d * s[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16));
962
+ y[l+40] = d * s[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16));
963
+ y[l+48] = d * s[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16));
964
+ y[l+56] = d * s[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16));
965
+ }
966
+ y += QK_K;
967
+ #endif
968
  }
969
  }
970
 
 
1032
 
1033
  uint8_t * restrict ql = y[i].ql;
1034
  uint8_t * restrict qh = y[i].qh;
1035
+ #if QK_K == 256
1036
  for (int j = 0; j < QK_K; j += 128) {
1037
  for (int l = 0; l < 32; ++l) {
1038
  const uint8_t q1 = L[j + l + 0] & 0xF;
 
1046
  ql += 64;
1047
  qh += 32;
1048
  }
1049
+ #else
1050
+ for (int l = 0; l < 32; ++l) {
1051
+ const uint8_t q1 = L[l + 0] & 0xF;
1052
+ const uint8_t q2 = L[l + 32] & 0xF;
1053
+ ql[l] = q1 | (q2 << 4);
1054
+ }
1055
+ for (int l = 0; l < 16; ++l) {
1056
+ qh[l] = (L[l] >> 4) | ((L[l + 16] >> 4) << 2) | ((L[l + 32] >> 4) << 4) | ((L[l + 48] >> 4) << 6);
1057
+ }
1058
+ #endif
1059
 
1060
  x += QK_K;
1061
 
 
1074
  const uint8_t * restrict qh = x[i].qh;
1075
  const int8_t * restrict sc = x[i].scales;
1076
 
1077
+ #if QK_K == 256
1078
  for (int n = 0; n < QK_K; n += 128) {
1079
  for (int l = 0; l < 32; ++l) {
1080
  int is = l/16;
 
1092
  qh += 32;
1093
  sc += 8;
1094
  }
1095
+ #else
1096
+ for (int l = 0; l < 16; ++l) {
1097
+ const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
1098
+ const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
1099
+ const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
1100
+ const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
1101
+ y[l+ 0] = d * sc[0] * q1;
1102
+ y[l+16] = d * sc[1] * q2;
1103
+ y[l+32] = d * sc[2] * q3;
1104
+ y[l+48] = d * sc[3] * q4;
1105
+ }
1106
+ y += 64;
1107
+ #endif
1108
 
1109
  }
1110
  }
 
1236
  }
1237
  #endif
1238
 
1239
+ #if QK_K == 256
1240
  void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
1241
 
1242
  const block_q2_K * restrict x = vx;
 
1393
 
1394
  *s = hsum_float_8(acc);
1395
 
1396
+ #elif defined __AVX__
1397
+
1398
+ const __m128i m3 = _mm_set1_epi8(0x3);
1399
+ const __m128i m4 = _mm_set1_epi8(0xF);
1400
+ const __m128i m2 = _mm_set1_epi8(0x2);
1401
+
1402
+ __m256 acc = _mm256_setzero_ps();
1403
+
1404
+ for (int i = 0; i < nb; ++i) {
1405
+
1406
+ const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
1407
+ const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
1408
+
1409
+ const uint8_t * restrict q2 = x[i].qs;
1410
+ const int8_t * restrict q8 = y[i].qs;
1411
+
1412
+ // load mins and scales from block_q2_K.scales[QK_K/16]
1413
+ const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
1414
+ const __m128i scales16 = _mm_and_si128(mins_and_scales, m4);
1415
+ const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
1416
+ const __m128i mins_0 = _mm_cvtepi8_epi16(mins16);
1417
+ const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16));
1418
+
1419
+ // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2
1420
+ const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0]));
1421
+ const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
1422
+
1423
+ // sumf += -dmin * summs in 32bits*8
1424
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
1425
+
1426
+ const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
1427
+ const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
1428
+ const __m128i scales[2] = { scales_0, scales_1 };
1429
+
1430
+ __m128i sumi_0 = _mm_setzero_si128();
1431
+ __m128i sumi_1 = _mm_setzero_si128();
1432
+
1433
+ for (int j = 0; j < QK_K/128; ++j) {
1434
+
1435
+ // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K]
1436
+ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1437
+ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1438
+ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1439
+ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1440
+ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1441
+ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1442
+ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1443
+ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
1444
+
1445
+ // load 2bits*16*8 from block_q2_K.qs[QK_K/4]
1446
+ __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
1447
+ const __m128i q2_0 = _mm_and_si128(q2bits, m3);
1448
+ const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
1449
+ const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
1450
+ const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
1451
+ q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
1452
+ const __m128i q2_1 = _mm_and_si128(q2bits, m3);
1453
+ const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
1454
+ const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
1455
+ const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
1456
+
1457
+ // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8
1458
+ __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0);
1459
+ __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1);
1460
+ __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2);
1461
+ __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3);
1462
+ __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4);
1463
+ __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5);
1464
+ __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6);
1465
+ __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7);
1466
+
1467
+ // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8
1468
+ __m128i shuffle = _mm_set1_epi16(0x0100);
1469
+ p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0);
1470
+ shuffle = _mm_add_epi16(shuffle, m2);
1471
+ p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1);
1472
+ shuffle = _mm_add_epi16(shuffle, m2);
1473
+ p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2);
1474
+ shuffle = _mm_add_epi16(shuffle, m2);
1475
+ p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3);
1476
+ shuffle = _mm_add_epi16(shuffle, m2);
1477
+ p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4);
1478
+ shuffle = _mm_add_epi16(shuffle, m2);
1479
+ p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5);
1480
+ shuffle = _mm_add_epi16(shuffle, m2);
1481
+ p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6);
1482
+ shuffle = _mm_add_epi16(shuffle, m2);
1483
+ p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7);
1484
+
1485
+ p0 = _mm_add_epi32(p0, p1);
1486
+ p2 = _mm_add_epi32(p2, p3);
1487
+ p4 = _mm_add_epi32(p4, p5);
1488
+ p6 = _mm_add_epi32(p6, p7);
1489
+
1490
+ // isum in 32bits*4*2
1491
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2));
1492
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6));
1493
+ }
1494
+
1495
+ // sumf += dall * isum - dmin * summs in 32bits
1496
+ __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
1497
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
1498
+ }
1499
+
1500
+ *s = hsum_float_8(acc);
1501
+
1502
  #else
1503
 
1504
  float sumf = 0;
 
1542
  #endif
1543
  }
1544
 
1545
+ #else
1546
+
1547
+ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
1548
+
1549
+ const block_q2_K * restrict x = vx;
1550
+ const block_q8_K * restrict y = vy;
1551
+
1552
+ const int nb = n / QK_K;
1553
+
1554
+ #ifdef __ARM_NEON
1555
+
1556
+ const uint8x16_t m3 = vdupq_n_u8(0x3);
1557
+ const int32x4_t vzero = vdupq_n_s32(0);
1558
+
1559
+ int8x16x4_t q2bytes;
1560
+
1561
+ uint32_t aux32[2];
1562
+ const uint8_t * scales = (const uint8_t *)aux32;
1563
+
1564
+ float sum = 0;
1565
+
1566
+ for (int i = 0; i < nb; ++i) {
1567
+
1568
+ const float d = y[i].d * (float)x[i].d;
1569
+ const float dmin = -y[i].d * (float)x[i].dmin;
1570
+
1571
+ const uint8_t * restrict q2 = x[i].qs;
1572
+ const int8_t * restrict q8 = y[i].qs;
1573
+ const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
1574
+
1575
+ aux32[0] = sc[0] & 0x0f0f0f0f;
1576
+ aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
1577
+
1578
+ sum += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]);
1579
+
1580
+ int isum1 = 0, isum2 = 0;
1581
+
1582
+ const uint8x16_t q2bits = vld1q_u8(q2);
1583
+
1584
+ const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
1585
+
1586
+ q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3));
1587
+ q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3));
1588
+ q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
1589
+ q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
1590
+
1591
+ #if defined(__ARM_FEATURE_DOTPROD)
1592
+ isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
1593
+ isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
1594
+ isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
1595
+ isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
1596
+ #else
1597
+ const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
1598
+ vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));
1599
+ const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
1600
+ vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));
1601
+ isum1 += vaddvq_s16(p1) * scales[0];
1602
+ isum2 += vaddvq_s16(p2) * scales[1];
1603
+
1604
+ const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
1605
+ vmull_s8(vget_high_s8(q2bytes.val[2]), vget_high_s8(q8bytes.val[2])));
1606
+ const int16x8_t p4 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
1607
+ vmull_s8(vget_high_s8(q2bytes.val[3]), vget_high_s8(q8bytes.val[3])));
1608
+ isum1 += vaddvq_s16(p3) * scales[2];
1609
+ isum2 += vaddvq_s16(p4) * scales[3];
1610
+ #endif
1611
+ sum += d * (isum1 + isum2);
1612
+
1613
+ }
1614
+
1615
+ *s = sum;
1616
+
1617
+ #elif defined __AVX2__
1618
+
1619
+ const __m256i m3 = _mm256_set1_epi8(3);
1620
+
1621
+ __m256 acc = _mm256_setzero_ps();
1622
+
1623
+ uint32_t ud, um;
1624
+ const uint8_t * restrict db = (const uint8_t *)&ud;
1625
+ const uint8_t * restrict mb = (const uint8_t *)&um;
1626
+
1627
+ float summs = 0;
1628
+
1629
+ // TODO: optimize this
1630
+
1631
+ for (int i = 0; i < nb; ++i) {
1632
+
1633
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
1634
+ const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
1635
+
1636
+ const uint8_t * restrict q2 = x[i].qs;
1637
+ const int8_t * restrict q8 = y[i].qs;
1638
+
1639
+ const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
1640
+ ud = (sc[0] >> 0) & 0x0f0f0f0f;
1641
+ um = (sc[0] >> 4) & 0x0f0f0f0f;
1642
+
1643
+ int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
1644
+ summs += dmin * smin;
1645
+
1646
+ const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
1647
+ const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
1648
+ const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
1649
+
1650
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
1651
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
1652
+
1653
+ const __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0);
1654
+ const __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1);
1655
+
1656
+ const __m256i p_0 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 0));
1657
+ const __m256i p_1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 1));
1658
+ const __m256i p_2 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 0));
1659
+ const __m256i p_3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 1));
1660
+
1661
+ acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0), acc);
1662
+ acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1), acc);
1663
+ acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2), acc);
1664
+ acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3), acc);
1665
+ }
1666
+
1667
+ *s = hsum_float_8(acc) + summs;
1668
+
1669
+ #else
1670
+
1671
+ float sumf = 0;
1672
+
1673
+ int isum[4];
1674
+
1675
+ for (int i = 0; i < nb; ++i) {
1676
+
1677
+ const uint8_t * q2 = x[i].qs;
1678
+ const int8_t * q8 = y[i].qs;
1679
+ const uint8_t * sc = x[i].scales;
1680
+
1681
+ int summs = 0;
1682
+ for (int j = 0; j < QK_K/16; ++j) {
1683
+ summs += y[i].bsums[j] * (sc[j] >> 4);
1684
+ }
1685
+
1686
+ const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
1687
+ const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
1688
+
1689
+ isum[0] = isum[1] = isum[2] = isum[3] = 0;
1690
+ for (int l = 0; l < 16; ++l) {
1691
+ isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
1692
+ isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
1693
+ isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
1694
+ isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
1695
+ }
1696
+ for (int l = 0; l < 4; ++l) {
1697
+ isum[l] *= (sc[l] & 0xF);
1698
+ }
1699
+ sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
1700
+ }
1701
+ *s = sumf;
1702
+ #endif
1703
+ }
1704
+ #endif
1705
+
1706
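Aside: the k-quant kernels in this file all lean on one identity. A block stores a super-scale (x[i].d), a super-min (x[i].dmin) and small per-group scales and mins, so a dequantized weight is roughly dall*scale*q - dmin*min. The min term does not depend on the individual quant, so the dot product against the q8 block splits into an integer accumulator (isum) plus the precomputed per-group sums of the q8 values (y[i].bsums), which is exactly the "sumf += dall * isum - dmin * summs" step in the q2_K code above. A minimal standalone C sketch of that identity, with toy sizes and made-up values (nothing below is code from the commit):

    /* sketch: why "dall * isum - dmin * summs" equals the dequantized dot product */
    #include <stdio.h>

    int main(void) {
        /* toy block: 2 sub-groups of 4 quants each */
        const int q[2][4]  = { {1, 3, 0, 2}, {2, 2, 1, 3} };   /* low-bit quants     */
        const int sc[2]    = { 5, 7 };                          /* per-group scales   */
        const int mn[2]    = { 1, 2 };                          /* per-group mins     */
        const int q8[2][4] = { {3, -1, 4, 2}, {-2, 5, 1, 0} };  /* activations (q8_K) */
        const float dall = 0.25f, dmin = 0.125f;                /* block super-scales */

        /* reference: dequantize every weight, then take the dot product */
        float ref = 0.f;
        for (int g = 0; g < 2; ++g)
            for (int l = 0; l < 4; ++l)
                ref += (dall * sc[g] * q[g][l] - dmin * mn[g]) * q8[g][l];

        /* kernel style: integer partial sums, floats only at the very end */
        int isum = 0, summs = 0;
        for (int g = 0; g < 2; ++g) {
            int dot = 0, bsum = 0;
            for (int l = 0; l < 4; ++l) { dot += q[g][l] * q8[g][l]; bsum += q8[g][l]; }
            isum  += sc[g] * dot;   /* scales (low nibbles) times group dot products */
            summs += mn[g] * bsum;  /* mins (high nibbles) times the q8 group sums   */
        }
        const float fast = dall * isum - dmin * summs;

        printf("ref = %g, fast = %g\n", ref, fast);  /* the two agree */
        return 0;
    }

The AVX and NEON paths compute isum and summs with packed integer multiplies and only touch floats once per block, which is why the q8_K blocks carry the precomputed bsums.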
+ #if QK_K == 256
1707
  void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
1708
  assert(n % QK_K == 0);
1709
 
 
1937
 
1938
  *s = hsum_float_8(acc);
1939
 
1940
+ #elif defined __AVX__
1941
 
1942
+ const __m128i m3 = _mm_set1_epi8(3);
1943
+ const __m128i mone = _mm_set1_epi8(1);
1944
+ const __m128i m32 = _mm_set1_epi8(32);
1945
+ const __m128i m2 = _mm_set1_epi8(2);
 
1946
 
1947
+ __m256 acc = _mm256_setzero_ps();
1948
+
1949
+ uint32_t *aux;
1950
 
 
1951
  for (int i = 0; i < nb; ++i) {
1952
+
1953
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
1954
+
1955
  const uint8_t * restrict q3 = x[i].qs;
1956
+ const int8_t * restrict q8 = y[i].qs;
1957
+
1958
+ // Set up scales
1959
+ aux = (uint32_t *)x[i].scales;
1960
+ __m128i scales128 = _mm_set_epi32(
1961
+ ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
1962
+ ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
1963
+ (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
1964
+ (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
1965
+ scales128 = _mm_sub_epi8(scales128, m32);
1966
+ const __m128i scales_0 = _mm_cvtepi8_epi16(scales128);
1967
+ const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128));
1968
+ const __m128i scales[2] = { scales_0, scales_1 };
1969
+
1970
+ // high bit *128*2 from block_q3_K.hmask[QK_K/8]
1971
+ const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]);
1972
+ const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]);
1973
+
1974
+ // integer accumulator
1975
+ __m128i sumi_0 = _mm_setzero_si128();
1976
+ __m128i sumi_1 = _mm_setzero_si128();
1977
+
1978
+ for (int j = 0; j < QK_K/128; ++j) {
1979
+ // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4]
1980
+ const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
1981
+ const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
1982
+
1983
+ // prepare low and high bits
1984
+ const int bit = j << 2;
1985
+
1986
+ const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3);
1987
+ const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3);
1988
+ const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);
1989
+ const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2);
1990
+
1991
+ const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3);
1992
+ const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3);
1993
+ const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
1994
+ const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
1995
+
1996
+ const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3);
1997
+ const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3);
1998
+ const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
1999
+ const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
2000
+
2001
+ const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3);
2002
+ const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3);
2003
+ const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
2004
+ const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
2005
+
2006
+ // load Q8 quants from block_q8_K.qs[QK_K]
2007
+ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2008
+ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2009
+ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2010
+ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2011
+ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2012
+ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2013
+ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2014
+ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2015
+
2016
+ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
2017
+ // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
2018
+ // and 2 if the high bit was set)
2019
+ __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0);
2020
+ __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1);
2021
+ __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2);
2022
+ __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3);
2023
+ __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4);
2024
+ __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5);
2025
+ __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6);
2026
+ __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7);
2027
+
2028
+ __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0);
2029
+ __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1);
2030
+ __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2);
2031
+ __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3);
2032
+ __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4);
2033
+ __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5);
2034
+ __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6);
2035
+ __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7);
2036
+
2037
+ p16_0 = _mm_sub_epi16(p16_0, q8s_0);
2038
+ p16_1 = _mm_sub_epi16(p16_1, q8s_1);
2039
+ p16_2 = _mm_sub_epi16(p16_2, q8s_2);
2040
+ p16_3 = _mm_sub_epi16(p16_3, q8s_3);
2041
+ p16_4 = _mm_sub_epi16(p16_4, q8s_4);
2042
+ p16_5 = _mm_sub_epi16(p16_5, q8s_5);
2043
+ p16_6 = _mm_sub_epi16(p16_6, q8s_6);
2044
+ p16_7 = _mm_sub_epi16(p16_7, q8s_7);
2045
+
2046
+ // multiply with scales
2047
+ __m128i shuffle = _mm_set1_epi16(0x0100);
2048
+ p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0);
2049
+ shuffle = _mm_add_epi16(shuffle, m2);
2050
+ p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1);
2051
+ shuffle = _mm_add_epi16(shuffle, m2);
2052
+ p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2);
2053
+ shuffle = _mm_add_epi16(shuffle, m2);
2054
+ p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3);
2055
+ shuffle = _mm_add_epi16(shuffle, m2);
2056
+ p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4);
2057
+ shuffle = _mm_add_epi16(shuffle, m2);
2058
+ p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5);
2059
+ shuffle = _mm_add_epi16(shuffle, m2);
2060
+ p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6);
2061
+ shuffle = _mm_add_epi16(shuffle, m2);
2062
+ p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7);
2063
+
2064
+ // accumulate
2065
+ p16_0 = _mm_add_epi32(p16_0, p16_1);
2066
+ p16_2 = _mm_add_epi32(p16_2, p16_3);
2067
+ p16_4 = _mm_add_epi32(p16_4, p16_5);
2068
+ p16_6 = _mm_add_epi32(p16_6, p16_7);
2069
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
2070
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6));
2071
+
2072
+ }
2073
+
2074
+ // multiply with block scale and accumulate
2075
+ __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
2076
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
2077
+
2078
+ }
2079
+
2080
+ *s = hsum_float_8(acc);
2081
+
2082
+ #else
2083
+ // scalar version
2084
+ // This function is written like this so the compiler can manage to vectorize most of it
2085
+ // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
2086
+ // manually vectorized version above. Every other version I tried would run at least 4 times slower.
2087
+ // The ideal situation would be if we could just write the code once, and the compiler would
2088
+ // automatically produce the best possible set of machine instructions, instead of us having to manually
2089
+ // write vectorized versions for AVX, ARM_NEON, etc.
2090
+
2091
+ int8_t aux8[QK_K];
2092
+ int16_t aux16[8];
2093
+ float sums [8];
2094
+ int32_t aux32[8];
2095
+ memset(sums, 0, 8*sizeof(float));
2096
+
2097
+ uint32_t auxs[4];
2098
+ const int8_t * scales = (const int8_t*)auxs;
2099
+
2100
+ float sumf = 0;
2101
+ for (int i = 0; i < nb; ++i) {
2102
+ const uint8_t * restrict q3 = x[i].qs;
2103
+ const uint8_t * restrict hm = x[i].hmask;
2104
+ const int8_t * restrict q8 = y[i].qs;
2105
+ memset(aux32, 0, 8*sizeof(int32_t));
2106
+ int8_t * restrict a = aux8;
2107
+ uint8_t m = 1;
2108
+ for (int j = 0; j < QK_K; j += 128) {
2109
+ for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
2110
  for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
2111
  a += 32; m <<= 1;
2112
  for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
 
2146
 
2147
  }
2148
 
2149
+ #else
2150
+
2151
+ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
2152
+ assert(n % QK_K == 0);
2153
+
2154
+ const block_q3_K * restrict x = vx;
2155
+ const block_q8_K * restrict y = vy;
2156
+
2157
+ const int nb = n / QK_K;
2158
+
2159
+ #ifdef __ARM_NEON
2160
+
2161
+ #ifdef __ARM_FEATURE_DOTPROD
2162
+ const int32x4_t vzero = vdupq_n_s32(0);
2163
+ #endif
2164
+
2165
+ const uint8x16_t m3b = vdupq_n_u8(0x3);
2166
+ const uint8x16_t mh = vdupq_n_u8(4);
2167
+
2168
+ int8x16x4_t q3bytes;
2169
+
2170
+ uint16_t aux16[2];
2171
+ int8_t * scales = (int8_t *)aux16;
2172
+
2173
+ float sum = 0;
2174
+
2175
+ for (int i = 0; i < nb; ++i) {
2176
+
2177
+ uint8x16x4_t q3h;
2178
+
2179
+ const uint8x8_t hbits = vld1_u8(x[i].hmask);
2180
+ const uint8x16_t q3bits = vld1q_u8(x[i].qs);
2181
+ const int8x16x4_t q8bytes = vld1q_s8_x4(y[i].qs);
2182
+
2183
+ const uint16_t a = *(const uint16_t *)x[i].scales;
2184
+ aux16[0] = a & 0x0f0f;
2185
+ aux16[1] = (a >> 4) & 0x0f0f;
2186
+
2187
+ for (int j = 0; j < 4; ++j) scales[j] -= 8;
2188
+
2189
+ int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
2190
+
2191
+ const float d = y[i].d * (float)x[i].d;
2192
+
2193
+ const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
2194
+ q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
2195
+ q3h.val[1] = vandq_u8(mh, htmp);
2196
+ q3h.val[2] = vandq_u8(mh, vshrq_n_u8(htmp, 2));
2197
+ q3h.val[3] = vandq_u8(mh, vshrq_n_u8(htmp, 4));
2198
+
2199
+ q3bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q3bits, m3b), q3h.val[0]));
2200
+ q3bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 2), m3b), q3h.val[1]));
2201
+ q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
2202
+ q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));
2203
+
2204
+ #if defined(__ARM_FEATURE_DOTPROD)
2205
+ isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
2206
+ isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
2207
+ isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
2208
+ isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
2209
+ #else
2210
+ const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
2211
+ vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes.val[0])));
2212
+ const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
2213
+ vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes.val[1])));
2214
+ const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
2215
+ vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes.val[2])));
2216
+ const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
2217
+ vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes.val[3])));
2218
+ isum += vaddvq_s16(p0) * scales[0] + vaddvq_s16(p1) * scales[2] + vaddvq_s16(p2) * scales[1] + vaddvq_s16(p3) * scales[3];
2219
+ #endif
2220
+
2221
+ sum += d * isum;
2222
+
2223
+ }
2224
+
2225
+ *s = sum;
2226
+
2227
+ #elif defined __AVX2__
2228
+
2229
+ const __m256i m3 = _mm256_set1_epi8(3);
2230
+ const __m256i m1 = _mm256_set1_epi8(1);
2231
+
2232
+ __m256 acc = _mm256_setzero_ps();
2233
+
2234
+ uint64_t aux64;
2235
+
2236
+ uint16_t aux16[2];
2237
+ const int8_t * aux8 = (const int8_t *)aux16;
2238
+
2239
+ for (int i = 0; i < nb; ++i) {
2240
+
2241
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
2242
+
2243
+ const uint8_t * restrict q3 = x[i].qs;
2244
+ const int8_t * restrict q8 = y[i].qs;
2245
+
2246
+ const uint16_t a = *(const uint16_t *)x[i].scales;
2247
+ aux16[0] = a & 0x0f0f;
2248
+ aux16[1] = (a >> 4) & 0x0f0f;
2249
+
2250
+ const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
2251
+ const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
2252
+
2253
+ memcpy(&aux64, x[i].hmask, 8);
2254
+
2255
+ const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
2256
+ __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
2257
+ __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
2258
+ q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
2259
+ q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
2260
+
2261
+ // load low 2 bits
2262
+ const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
2263
+
2264
+ // prepare low and high bits
2265
+ const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
2266
+ const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
2267
+ const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
2268
+
2269
+ // load Q8 quants
2270
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
2271
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
2272
+
2273
+ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
2274
+ // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
2275
+ // and 2 if the high bit was set)
2276
+ const __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
2277
+ const __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
2278
+
2279
+ __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
2280
+ __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
2281
+
2282
+ p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
2283
+ p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
2284
+
2285
+ // multiply with scales
2286
+ p16_0 = _mm256_madd_epi16(scale_0, p16_0);
2287
+ p16_1 = _mm256_madd_epi16(scale_1, p16_1);
2288
+
2289
+ p16_0 = _mm256_add_epi32(p16_0, p16_1);
2290
+
2291
+ // multiply with block scale and accumulate
2292
+ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16_0), acc);
2293
+
2294
+ }
2295
+
2296
+ *s = hsum_float_8(acc);
2297
+
2298
+ #else
2299
+
2300
+ int8_t aux8[QK_K];
2301
+ int16_t aux16[8];
2302
+ float sums [8];
2303
+ int32_t aux32[8];
2304
+ int32_t scales[4];
2305
+ memset(sums, 0, 8*sizeof(float));
2306
+
2307
+ float sumf = 0;
2308
+ for (int i = 0; i < nb; ++i) {
2309
+ const uint8_t * restrict q3 = x[i].qs;
2310
+ const uint8_t * restrict hm = x[i].hmask;
2311
+ const int8_t * restrict q8 = y[i].qs;
2312
+ int8_t * restrict a = aux8;
2313
+ for (int l = 0; l < 8; ++l) {
2314
+ a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4);
2315
+ a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 0 : 4);
2316
+ a[l+16] = (int8_t)((q3[l+0] >> 2) & 3) - (hm[l] & 0x04 ? 0 : 4);
2317
+ a[l+24] = (int8_t)((q3[l+8] >> 2) & 3) - (hm[l] & 0x08 ? 0 : 4);
2318
+ a[l+32] = (int8_t)((q3[l+0] >> 4) & 3) - (hm[l] & 0x10 ? 0 : 4);
2319
+ a[l+40] = (int8_t)((q3[l+8] >> 4) & 3) - (hm[l] & 0x20 ? 0 : 4);
2320
+ a[l+48] = (int8_t)((q3[l+0] >> 6) & 3) - (hm[l] & 0x40 ? 0 : 4);
2321
+ a[l+56] = (int8_t)((q3[l+8] >> 6) & 3) - (hm[l] & 0x80 ? 0 : 4);
2322
+ }
2323
+
2324
+ scales[0] = (x[i].scales[0] & 0xF) - 8;
2325
+ scales[1] = (x[i].scales[0] >> 4) - 8;
2326
+ scales[2] = (x[i].scales[1] & 0xF) - 8;
2327
+ scales[3] = (x[i].scales[1] >> 4) - 8;
2328
+
2329
+ memset(aux32, 0, 8*sizeof(int32_t));
2330
+ for (int j = 0; j < QK_K/16; ++j) {
2331
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
2332
+ q8 += 8; a += 8;
2333
+ for (int l = 0; l < 8; ++l) aux16[l] += q8[l] * a[l];
2334
+ q8 += 8; a += 8;
2335
+ for (int l = 0; l < 8; ++l) aux32[l] += scales[j] * aux16[l];
2336
+ }
2337
+ const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
2338
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2339
+ }
2340
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
2341
+ *s = sumf;
2342
+
2343
+ #endif
2344
+
2345
+ }
2346
+ #endif
2347
+
2348
+ #if QK_K == 256
2349
  void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
2350
  assert(n % QK_K == 0);
2351
 
 
2459
  const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
2460
  const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
2461

2462
  memcpy(utmp, x[i].scales, 12);
2463
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
2464
  const uint32_t uaux = utmp[1] & kmask1;
  utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
2466
  utmp[2] = uaux;
2467
  utmp[0] &= kmask1;
2468
 
2469
+ const uint8_t * restrict q4 = x[i].qs;
2470
+ const int8_t * restrict q8 = y[i].qs;
2471
+
2472
  const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
2473
 
2474
  const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
 
2512
 
2513
  *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
2514
 
2515
+ #elif defined __AVX__
2516
+
2517
+ const __m128i m4 = _mm_set1_epi8(0xF);
2518
+ const __m128i m2 = _mm_set1_epi8(0x2);
2519
+
2520
+ __m256 acc = _mm256_setzero_ps();
2521
+ __m128 acc_m = _mm_setzero_ps();
2522
+
2523
+ for (int i = 0; i < nb; ++i) {
2524
+
2525
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
2526
+ const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
2527
+
2528
+ const uint8_t * restrict q4 = x[i].qs;
2529
+ const int8_t * restrict q8 = y[i].qs;
2530
+
2531
+ memcpy(utmp, x[i].scales, 12);
2532
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
2533
+ const uint32_t uaux = utmp[1] & kmask1;
2534
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
2535
+ utmp[2] = uaux;
2536
+ utmp[0] &= kmask1;
2537
+
2538
+ const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
2539
+ const __m128i scales = _mm_cvtepu8_epi16(utmps);
2540
+ const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
2541
+
2542
+ const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
2543
+ const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
2544
+ const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
2545
+ const __m128i prod = _mm_madd_epi16(mins, q8s);
2546
+ acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m);
2547
+
2548
+ __m128i sumi_0 = _mm_setzero_si128();
2549
+ __m128i sumi_1 = _mm_setzero_si128();
2550
+
2551
+ __m128i shuffle = _mm_set1_epi16(0x0100);
2552
+ for (int j = 0; j < QK_K/64; ++j) {
2553
+
2554
+ const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle);
2555
+ shuffle = _mm_add_epi16(shuffle, m2);
2556
+ const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle);
2557
+ shuffle = _mm_add_epi16(shuffle, m2);
2558
+
2559
+ __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
2560
+ const __m128i q4l_0 = _mm_and_si128(q4bits, m4);
2561
+ const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
2562
+ q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
2563
+ const __m128i q4l_1 = _mm_and_si128(q4bits, m4);
2564
+ const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
2565
+
2566
+ const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2567
+ __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0);
2568
+ p16l = _mm_madd_epi16(scale_l, p16l);
2569
+ sumi_0 = _mm_add_epi32(sumi_0, p16l);
2570
+ const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2571
+ p16l = _mm_maddubs_epi16(q4l_1, q8l_1);
2572
+ p16l = _mm_madd_epi16(scale_l, p16l);
2573
+ sumi_1 = _mm_add_epi32(sumi_1, p16l);
2574
+
2575
+ const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2576
+ __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0);
2577
+ p16h = _mm_madd_epi16(scale_h, p16h);
2578
+ sumi_0 = _mm_add_epi32(sumi_0, p16h);
2579
+ const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
2580
+ p16h = _mm_maddubs_epi16(q4h_1, q8h_1);
2581
+ p16h = _mm_madd_epi16(scale_h, p16h);
2582
+ sumi_1 = _mm_add_epi32(sumi_1, p16h);
2583
+
2584
+ }
2585
+
2586
+ __m256 vd = _mm256_set1_ps(d);
2587
+ __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
2588
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
2589
+
2590
+ }
2591
+
2592
+ acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
2593
+ acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
2594
+
2595
+ *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
2596
+
2597
  #else
2598
 
2599
 
 
2653
  *s = sumf;
2654
  #endif
2655
  }
2656
+ #else
2657
+ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
2658
+ assert(n % QK_K == 0);
2659
+
2660
+ const block_q4_K * restrict x = vx;
2661
+ const block_q8_K * restrict y = vy;
2662
+
2663
+ const int nb = n / QK_K;
2664
+
2665
+ #ifdef __ARM_NEON
2666
+
2667
+ const uint8x16_t m4b = vdupq_n_u8(0xf);
2668
+
2669
+ #ifdef __ARM_FEATURE_DOTPROD
2670
+ const int32x4_t mzero = vdupq_n_s32(0);
2671
+ #endif
2672
+
2673
+ float sumf = 0;
2674
+
2675
+ int8x16x2_t q4bytes;
2676
+ int8x16x4_t q8bytes;
2677
+
2678
+ float sum_mins = 0.f;
2679
+
2680
+ uint16_t aux16[2];
2681
+ const uint8_t * restrict scales = (const uint8_t *)aux16;
2682
+
2683
+ for (int i = 0; i < nb; ++i) {
2684
+
2685
+ const uint8_t * restrict q4 = x[i].qs;
2686
+ const int8_t * restrict q8 = y[i].qs;
2687
+
2688
+ const uint16_t * restrict a = (const uint16_t *)x[i].scales;
2689
+ aux16[0] = a[0] & 0x0f0f;
2690
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
2691
+
2692
+ const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
2693
+ sum_mins += y[i].d * (float)x[i].d[1] * summi;
2694
+
2695
+ const float d = y[i].d * (float)x[i].d[0];
2696
+
2697
+ const uint8x16x2_t q4bits = vld1q_u8_x2(q4);
2698
+
2699
+ #ifdef __ARM_FEATURE_DOTPROD
2700
+ q8bytes = vld1q_s8_x4(q8);
2701
+ q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
2702
+ q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
2703
+
2704
+ const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
2705
+ const int32_t sumi1 = vaddvq_s32(p1) * scales[0];
2706
+
2707
+ q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
2708
+ q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
2709
+
2710
+ const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
2711
+ const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
2712
+
2713
+ #else
2714
+ q8bytes = vld1q_s8_x4(q8);
2715
+ q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
2716
+ q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
2717
+ const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
2718
+ vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
2719
+ const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
2720
+ vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
2721
+ int32_t sumi1 = vaddvq_s16(vaddq_s16(p0, p1)) * scales[0];
2722
+
2723
+ q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
2724
+ q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
2725
+ const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[2])),
2726
+ vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[2])));
2727
+ const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[3])),
2728
+ vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[3])));
2729
+ int32_t sumi2 = vaddvq_s16(vaddq_s16(p2, p3)) * scales[1];
2730
+
2731
+ #endif
2732
+ sumf += d * (sumi1 + sumi2);
2733
+
2734
+ }
2735
+
2736
+ *s = sumf - sum_mins;
2737
+
2738
+ #elif defined __AVX2__
2739
+
2740
+ const __m256i m4 = _mm256_set1_epi8(0xF);
2741
+
2742
+ __m256 acc = _mm256_setzero_ps();
2743
+
2744
+ float summs = 0;
2745
+
2746
+ uint16_t aux16[2];
2747
+ const uint8_t * scales = (const uint8_t *)aux16;
2748
+
2749
+ for (int i = 0; i < nb; ++i) {
2750
+
2751
+ const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d;
2752
+ const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d;
2753
+ const __m256 vd = _mm256_set1_ps(d);
2754
+
2755
+ const uint16_t * a = (const uint16_t *)x[i].scales;
2756
+ aux16[0] = a[0] & 0x0f0f;
2757
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
2758
+
2759
+ summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
2760
+
2761
+ const uint8_t * restrict q4 = x[i].qs;
2762
+ const int8_t * restrict q8 = y[i].qs;
2763
+
2764
+ const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
2765
+ const __m256i q4l = _mm256_and_si256(q4bits, m4);
2766
+ const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);
2767
 
2768
+ const __m256i q8l = _mm256_loadu_si256((const __m256i*)(q8+ 0));
2769
+ const __m256i q8h = _mm256_loadu_si256((const __m256i*)(q8+32));
2770
+
2771
+ const __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
2772
+ const __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
2773
+
2774
+ const __m256i p32l = _mm256_madd_epi16(_mm256_set1_epi16(scales[0]), p16l);
2775
+ acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32l), acc);
2776
+
2777
+ const __m256i p32h = _mm256_madd_epi16(_mm256_set1_epi16(scales[1]), p16h);
2778
+ acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32h), acc);
2779
+
2780
+ }
2781
+
2782
+ *s = hsum_float_8(acc) - summs;
2783
+
2784
+ #else
2785
+
2786
+ uint8_t aux8[QK_K];
2787
+ int16_t aux16[16];
2788
+ float sums [8];
2789
+ memset(sums, 0, 8*sizeof(float));
2790
+
2791
+ uint16_t s16[2];
2792
+ const uint8_t * restrict scales = (const uint8_t *)s16;
2793
+
2794
+ float sumf = 0;
2795
+ for (int i = 0; i < nb; ++i) {
2796
+ const uint8_t * restrict q4 = x[i].qs;
2797
+ const int8_t * restrict q8 = y[i].qs;
2798
+ uint8_t * restrict a = aux8;
2799
+ for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF;
2800
+ for (int l = 0; l < 32; ++l) a[l+32] = q4[l] >> 4;
2801
+
2802
+ const uint16_t * restrict b = (const uint16_t *)x[i].scales;
2803
+ s16[0] = b[0] & 0x0f0f;
2804
+ s16[1] = (b[0] >> 4) & 0x0f0f;
2805
+
2806
+ sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
2807
+
2808
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]);
2809
+
2810
+ for (int j = 0; j < QK_K/32; ++j) {
2811
+ for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l];
2812
+ q8 += 16; a += 16;
2813
+ for (int l = 0; l < 16; ++l) aux16[l] += q8[l] * a[l];
2814
+ q8 += 16; a += 16;
2815
+ const float dl = d * scales[j];
2816
+ for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[l+8]);
2817
+ }
2818
+ }
2819
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
2820
+ *s = sumf;
2821
+ #endif
2822
+ }
2823
+ #endif
2824
+
2825
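Aside: several of the __AVX__ blocks above (q2_K, q3_K, q4_K, and q5_K below) step through the per-group scales with _mm_shuffle_epi8(scales, shuffle), where shuffle starts as _mm_set1_epi16(0x0100) and is advanced with _mm_add_epi16(shuffle, m2). The mask bytes 00 01 00 01 ... copy the first 16-bit scale into every lane, and adding 2 to every byte slides the window to the next scale. A scalar model of that idiom; the pshufb helper below is an illustrative stand-in for the intrinsic, not code from the commit:

    /* sketch: how the shuffle mask broadcasts one 16-bit scale per iteration */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* byte-wise model of _mm_shuffle_epi8 (pshufb) */
    static void pshufb(const uint8_t src[16], const uint8_t mask[16], uint8_t dst[16]) {
        for (int i = 0; i < 16; ++i)
            dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
    }

    int main(void) {
        /* eight 16-bit scales stored as 16 bytes, like the _mm_cvtepu8_epi16 result */
        const uint16_t scales[8] = { 11, 22, 33, 44, 55, 66, 77, 88 };
        uint8_t src[16], mask[16], out[16];
        memcpy(src, scales, sizeof(src));

        for (int i = 0; i < 16; ++i) mask[i] = (i & 1) ? 0x01 : 0x00; /* _mm_set1_epi16(0x0100) */

        for (int j = 0; j < 4; ++j) {
            pshufb(src, mask, out);
            uint16_t lane0, lane7;
            memcpy(&lane0, out,      sizeof(lane0));
            memcpy(&lane7, out + 14, sizeof(lane7));
            printf("step %d: lanes hold %u ... %u\n", j, (unsigned)lane0, (unsigned)lane7);
            for (int i = 0; i < 16; ++i) mask[i] += 2; /* shuffle = _mm_add_epi16(shuffle, m2) */
        }
        return 0;
    }

Advancing the mask with a 16-bit add of _mm_set1_epi8(2) works because the mask bytes stay small enough that no carry ever crosses a byte boundary.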
+ #if QK_K == 256
2826
  void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
2827
  assert(n % QK_K == 0);
2828
 
 
2936
 
2937
  for (int i = 0; i < nb; ++i) {
2938
 
 
 
 
2939
  const uint8_t * restrict q5 = x[i].qs;
2940
  const int8_t * restrict q8 = y[i].qs;
2941
 
2942
+ #if QK_K == 256
2943
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
2944
+ const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
2945
+
2946
  memcpy(utmp, x[i].scales, 12);
2947
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
2948
  const uint32_t uaux = utmp[1] & kmask1;
2949
  utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
2950
  utmp[2] = uaux;
2951
  utmp[0] &= kmask1;
2952
+ #else
2953
+ // TODO
2954
+ const float d = 0, dmin = 0;
2955
+ #endif
2956
 
2957
  const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
2958
 
 
2977
  const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
2978
  const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
2979
 
2980
+ const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
2981
+
2982
+ const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
2983
+ const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
2984
+ const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0);
2985
+ hmask = _mm256_slli_epi16(hmask, 1);
2986
+
2987
+ const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
2988
+ const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
2989
+ const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1);
2990
+ hmask = _mm256_slli_epi16(hmask, 1);
2991
+
2992
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2993
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2994
+
2995
+ __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
2996
+ __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
2997
+
2998
+ p16_0 = _mm256_madd_epi16(scale_0, p16_0);
2999
+ p16_1 = _mm256_madd_epi16(scale_1, p16_1);
3000
+
3001
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
3002
+
3003
+ }
3004
+
3005
+ __m256 vd = _mm256_set1_ps(d);
3006
+ acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
3007
+
3008
+ }
3009
+
3010
+ *s = hsum_float_8(acc) + summs;
3011
+
3012
+ #elif defined __AVX__
3013
+
3014
+ const __m128i m4 = _mm_set1_epi8(0xF);
3015
+ const __m128i mzero = _mm_setzero_si128();
3016
+ const __m128i mone = _mm_set1_epi8(1);
3017
+ const __m128i m2 = _mm_set1_epi8(2);
3018
+
3019
+ __m256 acc = _mm256_setzero_ps();
3020
+
3021
+ float summs = 0.f;
3022
+
3023
+ for (int i = 0; i < nb; ++i) {
3024
+
3025
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
3026
+ const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
3027
+
3028
+ const uint8_t * restrict q5 = x[i].qs;
3029
+ const int8_t * restrict q8 = y[i].qs;
3030
+
3031
+ memcpy(utmp, x[i].scales, 12);
3032
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
3033
+ const uint32_t uaux = utmp[1] & kmask1;
3034
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
3035
+ utmp[2] = uaux;
3036
+ utmp[0] &= kmask1;
3037
+
3038
+ const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
3039
+ const __m128i scales = _mm_cvtepu8_epi16(utmps);
3040
+ const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
3041
 
3042
+ const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
3043
+ const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
3044
+ const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
3045
+ const __m128i prod = _mm_madd_epi16(mins, q8s);
3046
+ const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
3047
+ summs += dmin * _mm_extract_epi32(hsum, 0);
3048
 
3049
+ const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]);
3050
+ const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]);
3051
+ __m128i hmask = mone;
 
3052
 
3053
+ __m128i sumi_0 = _mm_setzero_si128();
3054
+ __m128i sumi_1 = _mm_setzero_si128();
3055
 
3056
+ int bit = 0;
 
3057
 
3058
+ __m128i shuffle = _mm_set1_epi16(0x0100);
3059
+ for (int j = 0; j < QK_K/64; ++j) {
3060
 
3061
+ const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
3062
+ shuffle = _mm_add_epi16(shuffle, m2);
3063
+ const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
3064
+ shuffle = _mm_add_epi16(shuffle, m2);
3065
+
3066
+ const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
3067
+ const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
3068
+
3069
+ __m128i q5l_0 = _mm_and_si128(q5bits_0, m4);
3070
+ __m128i q5l_1 = _mm_and_si128(q5bits_1, m4);
3071
+ __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
3072
+ __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
3073
+ __m128i q5_0 = _mm_add_epi8(q5l_0, q5h_0);
3074
+ __m128i q5_1 = _mm_add_epi8(q5l_1, q5h_1);
3075
+ hmask = _mm_slli_epi16(hmask, 1);
3076
+
3077
+ __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
3078
+ __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
3079
+ __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0);
3080
+ __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1);
3081
+ p16_0 = _mm_madd_epi16(scale_0, p16_0);
3082
+ p16_1 = _mm_madd_epi16(scale_0, p16_1);
3083
+
3084
+ q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4);
3085
+ q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4);
3086
+ q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
3087
+ q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
3088
+ q5_0 = _mm_add_epi8(q5l_0, q5h_0);
3089
+ q5_1 = _mm_add_epi8(q5l_1, q5h_1);
3090
+ hmask = _mm_slli_epi16(hmask, 1);
3091
+
3092
+ q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
3093
+ q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
3094
+ __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0);
3095
+ __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1);
3096
+ p16_2 = _mm_madd_epi16(scale_1, p16_2);
3097
+ p16_3 = _mm_madd_epi16(scale_1, p16_3);
3098
+
3099
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
3100
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
3101
 
3102
  }
3103
 
3104
  __m256 vd = _mm256_set1_ps(d);
3105
+ __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
3106
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
3107
 
3108
  }
3109
 
 
3173
  #endif
3174
  }
3175
 
3176
+ #else
3177
+
3178
+ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
3179
+ assert(n % QK_K == 0);
3180
+
3181
+ const block_q5_K * restrict x = vx;
3182
+ const block_q8_K * restrict y = vy;
3183
+
3184
+ const int nb = n / QK_K;
3185
+
3186
+ #ifdef __ARM_NEON
3187
+
3188
+ const uint8x16_t m4b = vdupq_n_u8(0xf);
3189
+ const int32x4_t mzero = vdupq_n_s32(0);
3190
+ const uint8x16_t mh = vdupq_n_u8(16);
3191
+
3192
+ int8x16x4_t q5bytes;
3193
+ uint8x16x4_t q5h;
3194
+
3195
+ float sumf = 0;
3196
+
3197
+ for (int i = 0; i < nb; ++i) {
3198
+
3199
+ const float d = y[i].d * (float)x[i].d;
3200
+ const int8_t * sc = x[i].scales;
3201
+
3202
+ const uint8_t * restrict q5 = x[i].qs;
3203
+ const uint8_t * restrict qh = x[i].qh;
3204
+ const int8_t * restrict q8 = y[i].qs;
3205
+
3206
+ const uint8x8_t qhbits = vld1_u8(qh);
3207
+
3208
+ const uint8x16x2_t q5bits = vld1q_u8_x2(q5);
3209
+ const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
3210
+
3211
+ const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1));
3212
+ q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4));
3213
+ q5h.val[1] = vbicq_u8(mh, vshlq_n_u8(htmp, 2));
3214
+ q5h.val[2] = vbicq_u8(mh, htmp);
3215
+ q5h.val[3] = vbicq_u8(mh, vshrq_n_u8(htmp, 2));
3216
+
3217
+ q5bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[0], m4b)), vreinterpretq_s8_u8(q5h.val[0]));
3218
+ q5bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[1], m4b)), vreinterpretq_s8_u8(q5h.val[1]));
3219
+ q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
3220
+ q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
3221
+
3222
+ #if defined(__ARM_FEATURE_DOTPROD)
3223
+
3224
+ int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
3225
+ int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
3226
+ int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
3227
+ int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
3228
+
3229
+ sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
3230
+
3231
+ #else
3232
+
3233
+ const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
3234
+ vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
3235
+ const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
3236
+ vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
3237
+ int32_t sumi = sc[0] * vaddvq_s16(p0) + sc[1] * vaddvq_s16(p1);
3238
+
3239
+ const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
3240
+ vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
3241
+ const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
3242
+ vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
3243
+ sumi += sc[2] * vaddvq_s16(p2) + sc[3] * vaddvq_s16(p3);
3244
+
3245
+ sumf += d*sumi;
3246
+ #endif
3247
+
3248
+ }
3249
+
3250
+ *s = sumf;
3251
+
3252
+ #elif defined __AVX2__
3253
+
3254
+ const __m256i m4 = _mm256_set1_epi8(0xF);
3255
+ const __m256i mone = _mm256_set1_epi8(1);
3256
+
3257
+ __m256 acc = _mm256_setzero_ps();
3258
+
3259
+ for (int i = 0; i < nb; ++i) {
3260
+
3261
+ const uint8_t * restrict q5 = x[i].qs;
3262
+ const int8_t * restrict q8 = y[i].qs;
3263
+
3264
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
3265
+
3266
+ const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
3267
+
3268
+ const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
3269
+ const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
3270
+
3271
+ int64_t aux64;
3272
+ memcpy(&aux64, x[i].qh, 8);
3273
+ const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
3274
+ const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
3275
+
3276
+ const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
3277
+ const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
3278
+
3279
+ const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
3280
+ const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
3281
+
3282
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
3283
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
3284
+
3285
+ const __m256i p16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5l_0, q8_0));
3286
+ const __m256i p16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5l_1, q8_1));
3287
+ const __m256i s16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5h_0, q8_0));
3288
+ const __m256i s16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5h_1, q8_1));
3289
+
3290
+ const __m256i dot = _mm256_sub_epi32(_mm256_add_epi32(p16_0, p16_1), _mm256_add_epi32(s16_0, s16_1));
3291
+
3292
+ acc = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(dot), acc);
3293
+
3294
+ }
3295
+
3296
+ *s = hsum_float_8(acc);
3297
+
3298
+ #else
3299
+
3300
+
3301
+ uint8_t aux8[QK_K];
3302
+ int16_t aux16[16];
3303
+ float sums [8];
3304
+ memset(sums, 0, 8*sizeof(float));
3305
+
3306
+ float sumf = 0;
3307
+ for (int i = 0; i < nb; ++i) {
3308
+ const uint8_t * restrict q4 = x[i].qs;
3309
+ const uint8_t * restrict hm = x[i].qh;
3310
+ const int8_t * restrict q8 = y[i].qs;
3311
+ uint8_t * restrict a = aux8;
3312
+ for (int l = 0; l < 32; ++l) {
3313
+ a[l+ 0] = q4[l] & 0xF;
3314
+ a[l+32] = q4[l] >> 4;
3315
+ }
3316
+ for (int is = 0; is < 8; ++is) {
3317
+ uint8_t m = 1 << is;
3318
+ for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16);
3319
+ }
3320
+
3321
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
3322
+ const int8_t * restrict sc = x[i].scales;
3323
+
3324
+ for (int j = 0; j < QK_K/16; ++j) {
3325
+ const float dl = d * sc[j];
3326
+ for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l];
3327
+ for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]);
3328
+ q8 += 16; a += 16;
3329
+ }
3330
+ }
3331
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
3332
+ *s = sumf;
3333
+ #endif
3334
+ }
3335
+ #endif
3336
 
3337
 
3338
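Aside: q5_K above and q6_K below store the bits that do not fit in the 4-bit nibble plane (qs or ql) in a separate bit plane qh. Each kernel rebuilds a quant by pulling one bit (two bits for q6_K) out of qh, shifting it up to bit position 4, and combining it with the nibble, then applies the block scale and, depending on the variant, a min or a fixed offset. A standalone round-trip sketch of that packing, using a deliberately simplified layout of one value per byte (the real blocks pack two nibbles per byte and interleave the planes differently):

    /* sketch: 5-bit values as a 4-bit plane plus a 1-bit plane, and the reconstruction */
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        uint8_t src[32], qs[32], qh[4] = { 0 };

        for (int l = 0; l < 32; ++l) {
            src[l] = (uint8_t)((l * 7 + 3) & 0x1F);                  /* arbitrary 5-bit values */
            qs[l]  = src[l] & 0x0F;                                  /* nibble plane           */
            qh[l / 8] |= (uint8_t)(((src[l] >> 4) & 1) << (l % 8));  /* 5th bit into the plane */
        }

        int ok = 1;
        for (int l = 0; l < 32; ++l) {
            const uint8_t hbit = (qh[l / 8] >> (l % 8)) & 1;
            const uint8_t q5   = (uint8_t)(qs[l] | (hbit << 4));     /* rebuild: nibble plus high bit */
            if (q5 != src[l]) ok = 0;
        }
        printf(ok ? "round trip ok\n" : "round trip failed\n");
        return 0;
    }

In the vector paths the same thing happens sixteen or thirty-two lanes at a time: a mask selects one bit plane from the loaded qh bytes, the result is shifted to bit 4, and it is added to (q5_K) or OR-ed with (q6_K) the masked nibbles before the multiply-add against q8.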
+ #if QK_K == 256
3339
  void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
3340
  assert(n % QK_K == 0);
3341
 
 
3560
 
3561
  *s = hsum_float_8(acc);
3562
 
3563
+ #elif defined __AVX__
3564
+
3565
+ const __m128i m4 = _mm_set1_epi8(0xF);
3566
+ const __m128i m3 = _mm_set1_epi8(3);
3567
+ const __m128i m32s = _mm_set1_epi8(32);
3568
+ const __m128i m2 = _mm_set1_epi8(2);
3569
+
3570
+ __m256 acc = _mm256_setzero_ps();
3571
+
3572
+ for (int i = 0; i < nb; ++i) {
3573
+
3574
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
3575
+
3576
+ const uint8_t * restrict q4 = x[i].ql;
3577
+ const uint8_t * restrict qh = x[i].qh;
3578
+ const int8_t * restrict q8 = y[i].qs;
3579
+
3580
+ const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
3581
+
3582
+ __m128i sumi_0 = _mm_setzero_si128();
3583
+ __m128i sumi_1 = _mm_setzero_si128();
3584
+
3585
+ __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
3586
+ for (int j = 0; j < QK_K/128; ++j) {
3587
+
3588
+ const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
3589
+ const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
3590
+
3591
+ const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
3592
+ const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
3593
+ const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4);
3594
+ const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4);
3595
+ const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4);
3596
+ const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4);
3597
+ const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4);
3598
+ const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4);
3599
+
3600
+ const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
3601
+ const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
3602
+ const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
3603
+ const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
3604
+
3605
+ const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0);
3606
+ const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1);
3607
+ const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2);
3608
+ const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3);
3609
+ const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4);
3610
+ const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5);
3611
+ const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6);
3612
+ const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7);
3613
+
3614
+ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
3615
+ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
3616
+ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
3617
+ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
3618
+ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
3619
+ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
3620
+ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
3621
+ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
3622
+
3623
+ __m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0);
3624
+ __m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1);
3625
+ __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2);
3626
+ __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3);
3627
+ __m128i q8s_4 = _mm_maddubs_epi16(m32s, q8_4);
3628
+ __m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5);
3629
+ __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6);
3630
+ __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7);
3631
+
3632
+ __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
3633
+ __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
3634
+ __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
3635
+ __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
3636
+ __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
3637
+ __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
3638
+ __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
3639
+ __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
3640
+
3641
+ p16_0 = _mm_sub_epi16(p16_0, q8s_0);
3642
+ p16_1 = _mm_sub_epi16(p16_1, q8s_1);
3643
+ p16_2 = _mm_sub_epi16(p16_2, q8s_2);
3644
+ p16_3 = _mm_sub_epi16(p16_3, q8s_3);
3645
+ p16_4 = _mm_sub_epi16(p16_4, q8s_4);
3646
+ p16_5 = _mm_sub_epi16(p16_5, q8s_5);
3647
+ p16_6 = _mm_sub_epi16(p16_6, q8s_6);
3648
+ p16_7 = _mm_sub_epi16(p16_7, q8s_7);
3649
+
3650
+ const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
3651
+ shuffle = _mm_add_epi8(shuffle, m2);
3652
+ const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
3653
+ shuffle = _mm_add_epi8(shuffle, m2);
3654
+ const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle);
3655
+ shuffle = _mm_add_epi8(shuffle, m2);
3656
+ const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle);
3657
+ shuffle = _mm_add_epi8(shuffle, m2);
3658
+
3659
+ p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
3660
+ p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
3661
+ p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
3662
+ p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
3663
+ p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
3664
+ p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5);
3665
+ p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
3666
+ p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7);
3667
+
3668
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
3669
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
3670
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
3671
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
3672
+
3673
+ }
3674
+
3675
+ __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
3676
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
3677
+ }
3678
+
3679
+ *s = hsum_float_8(acc);
3680
+
3681
  #else
3682
 
3683
  int8_t aux8[QK_K];
 
3722
  *s = sumf;
3723
  #endif
3724
  }
3725
+
3726
+ #else
3727
+
3728
+ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
3729
+ assert(n % QK_K == 0);
3730
+
3731
+ const block_q6_K * restrict x = vx;
3732
+ const block_q8_K * restrict y = vy;
3733
+
3734
+ const int nb = n / QK_K;
3735
+
3736
+ #ifdef __ARM_NEON
3737
+
3738
+ float sum = 0;
3739
+
3740
+ const uint8x16_t m4b = vdupq_n_u8(0xF);
3741
+ const int32x4_t vzero = vdupq_n_s32(0);
3742
+ const int8x16_t m32s = vdupq_n_s8(32);
3743
+
3744
+ const uint8x16_t mone = vdupq_n_u8(3);
3745
+
3746
+ int8x16x4_t q6bytes;
3747
+ uint8x16x4_t q6h;
3748
+
3749
+ for (int i = 0; i < nb; ++i) {
3750
+
3751
+ const float d_all = (float)x[i].d;
3752
+
3753
+ const uint8_t * restrict q6 = x[i].ql;
3754
+ const uint8_t * restrict qh = x[i].qh;
3755
+ const int8_t * restrict q8 = y[i].qs;
3756
+
3757
+ const int8_t * restrict scale = x[i].scales;
3758
+
3759
+ int32_t isum = 0;
3760
+
3761
+ uint8x16_t qhbits = vld1q_u8(qh);
3762
+ uint8x16x2_t q6bits = vld1q_u8_x2(q6);
3763
+ int8x16x4_t q8bytes = vld1q_s8_x4(q8);
3764
+
3765
+ q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4);
3766
+ uint8x16_t shifted = vshrq_n_u8(qhbits, 2);
3767
+ q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
3768
+ shifted = vshrq_n_u8(qhbits, 4);
3769
+ q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
3770
+ shifted = vshrq_n_u8(qhbits, 6);
3771
+ q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
3772
+
3773
+ q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s);
3774
+ q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s);
3775
+ q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
3776
+ q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
3777
+
3778
+ #if defined(__ARM_FEATURE_DOTPROD)
3779
+
3780
+ isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
3781
+ vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
3782
+ vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
3783
+ vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
3784
+ #else
3785
+
3786
+ int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
3787
+ vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
3788
+ int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
3789
+ vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
3790
+ isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
3791
+
3792
+ int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
3793
+ vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
3794
+ int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
3795
+ vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
3796
+ isum += vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
3797
+ #endif
3798
+
3799
+ sum += isum * d_all * y[i].d;
3800
+
3801
+ }
3802
+ *s = sum;
3803
+
3804
+ #elif defined __AVX2__
3805
+
3806
+ const __m256i m4 = _mm256_set1_epi8(0xF);
3807
+ const __m256i m2 = _mm256_set1_epi8(3);
3808
+ const __m256i m32s = _mm256_set1_epi8(32);
3809
+
3810
+ __m256 acc = _mm256_setzero_ps();
3811
+
3812
+ for (int i = 0; i < nb; ++i) {
3813
+
3814
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
3815
+
3816
+ const uint8_t * restrict q4 = x[i].ql;
3817
+ const uint8_t * restrict qh = x[i].qh;
3818
+ const int8_t * restrict q8 = y[i].qs;
3819
+
3820
+ const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
3821
+ const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
3822
+ const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
3823
+ const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
3824
+
3825
+ __m256i sumi = _mm256_setzero_si256();
3826
+
3827
+ const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
3828
+ const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
3829
+
3830
+ const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
3831
+ const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
3832
+
3833
+ const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
3834
+ const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
3835
+
3836
+ const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
3837
+ const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
3838
+
3839
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
3840
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
3841
+
3842
+ __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
3843
+ __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
3844
+
3845
+ __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
3846
+ __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
3847
+
3848
+ p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
3849
+ p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
3850
+
3851
+ p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
3852
+ p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
3853
+
3854
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
3855
+
3856
+ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
3857
+ }
3858
+
3859
+ *s = hsum_float_8(acc);
3860
+
3861
+ #else
3862
+
3863
+ int8_t aux8[QK_K];
3864
+ int16_t aux16[8];
3865
+ float sums [8];
3866
+ int32_t aux32[8];
3867
+ memset(sums, 0, 8*sizeof(float));
3868
+
3869
+ float sumf = 0;
3870
+ for (int i = 0; i < nb; ++i) {
3871
+ const uint8_t * restrict q4 = x[i].ql;
3872
+ const uint8_t * restrict qh = x[i].qh;
3873
+ const int8_t * restrict q8 = y[i].qs;
3874
+ memset(aux32, 0, 8*sizeof(int32_t));
3875
+ int8_t * restrict a = aux8;
3876
+ for (int l = 0; l < 16; ++l) {
3877
+ a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
3878
+ a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
3879
+ a[l+32] = (int8_t)((q4[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
3880
+ a[l+48] = (int8_t)((q4[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
3881
+ }
3882
+ int is = 0;
3883
+ for (int j = 0; j < QK_K/16; ++j) {
3884
+ int scale = x[i].scales[is++];
3885
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
3886
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
3887
+ q8 += 8; a += 8;
3888
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
3889
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
3890
+ q8 += 8; a += 8;
3891
+ }
3892
+ const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
3893
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
3894
+ }
3895
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
3896
+ *s = sumf;
3897
+ #endif
3898
+ }
3899
+
3900
+ #endif
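
The block above closes out a second, QK_K == 64 build of ggml_vec_dot_q6_K_q8_K, with NEON and AVX2 paths plus a scalar fallback. The scalar path is the easiest place to see the q6_K layout: each weight is rebuilt from 4 low bits in ql and 2 high bits in qh, shifted into a signed range by subtracting 32, and the per-sub-block products are weighted by the signed 8-bit scales and both super-block scales. Below is a rough Python sketch of that fallback, assuming QK_K = 64 and using plain lists instead of the real block structs; the function names are made up for illustration.

    # Unpack one QK_K == 64 super-block of q6_K: 32 bytes of low nibbles + 16 bytes of high bit pairs.
    def unpack_q6_k_64(ql, qh):
        assert len(ql) == 32 and len(qh) == 16
        a = [0] * 64
        for l in range(16):
            a[l +  0] = ((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32
            a[l + 16] = ((ql[l + 16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32
            a[l + 32] = ((ql[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32
            a[l + 48] = ((ql[l + 16] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32
        return a  # 64 signed integers in [-32, 31]

    # Dot one q6_K super-block against one q8_K super-block, mirroring the scalar loop above.
    def dot_q6_k_q8_k_64(ql, qh, scales, d6, q8, d8):
        a = unpack_q6_k_64(ql, qh)
        isum = 0
        for j in range(4):  # QK_K/16 = 4 sub-blocks, each with its own signed 8-bit scale
            isum += scales[j] * sum(a[16 * j + k] * q8[16 * j + k] for k in range(16))
        return d6 * d8 * isum

The SIMD paths compute the same quantity; they just perform the ql/qh merge and the multiply-accumulate sixteen (NEON) or thirty-two (AVX2) byte lanes at a time.
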
k_quants.h CHANGED
@@ -7,7 +7,13 @@
7
  #include <stddef.h>
8
 
9
  // Super-block size
 
 
 
 
10
  #define QK_K 256
 
 
11
 
12
  //
13
  // Super-block quantization structures
@@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
29
  // weight is represented as x = a * q
30
  // 16 blocks of 16 elemenets each
31
  // Effectively 3.4375 bits per weight
 
32
  typedef struct {
33
  uint8_t hmask[QK_K/8]; // quants - high bit
34
  uint8_t qs[QK_K/4]; // quants - low 2 bits
35
- uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
36
  ggml_fp16_t d; // super-block scale
37
  } block_q3_K;
38
- static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
 
 
 
 
 
 
 
 
 
39
 
40
  // 4-bit quantization
41
  // 16 blocks of 32 elements each
42
  // weight is represented as x = a * q + b
43
  // Effectively 4.5 bits per weight
 
 
 
 
 
 
 
 
44
  typedef struct {
45
  ggml_fp16_t d; // super-block scale for quantized scales
46
  ggml_fp16_t dmin; // super-block scale for quantized mins
47
- uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
48
  uint8_t qs[QK_K/2]; // 4--bit quants
49
  } block_q4_K;
50
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
 
51
 
52
  // 5-bit quantization
53
  // 16 blocks of 32 elements each
54
  // weight is represented as x = a * q + b
55
  // Effectively 5.5 bits per weight
 
 
 
 
 
 
 
 
 
56
  typedef struct {
57
  ggml_fp16_t d; // super-block scale for quantized scales
58
  ggml_fp16_t dmin; // super-block scale for quantized mins
59
- uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
60
  uint8_t qh[QK_K/8]; // quants, high bit
61
  uint8_t qs[QK_K/2]; // quants, low 4 bits
62
  } block_q5_K;
63
- static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 
64
 
65
  // 6-bit quantization
66
  // weight is represented as x = a * q
 
7
  #include <stddef.h>
8
 
9
  // Super-block size
10
+ #ifdef GGML_QKK_64
11
+ #define QK_K 64
12
+ #define K_SCALE_SIZE 4
13
+ #else
14
  #define QK_K 256
15
+ #define K_SCALE_SIZE 12
16
+ #endif
17
 
18
  //
19
  // Super-block quantization structures
 
35
  // weight is represented as x = a * q
36
  // 16 blocks of 16 elemenets each
37
  // Effectively 3.4375 bits per weight
38
+ #ifdef GGML_QKK_64
39
  typedef struct {
40
  uint8_t hmask[QK_K/8]; // quants - high bit
41
  uint8_t qs[QK_K/4]; // quants - low 2 bits
42
+ uint8_t scales[2];
43
  ggml_fp16_t d; // super-block scale
44
  } block_q3_K;
45
+ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
46
+ #else
47
+ typedef struct {
48
+ uint8_t hmask[QK_K/8]; // quants - high bit
49
+ uint8_t qs[QK_K/4]; // quants - low 2 bits
50
+ uint8_t scales[12]; // scales, quantized with 6 bits
51
+ ggml_fp16_t d; // super-block scale
52
+ } block_q3_K;
53
+ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
54
+ #endif
55
 
56
  // 4-bit quantization
57
  // 16 blocks of 32 elements each
58
  // weight is represented as x = a * q + b
59
  // Effectively 4.5 bits per weight
60
+ #ifdef GGML_QKK_64
61
+ typedef struct {
62
+ ggml_fp16_t d[2]; // super-block scales/mins
63
+ uint8_t scales[2]; // 4-bit block scales/mins
64
+ uint8_t qs[QK_K/2]; // 4--bit quants
65
+ } block_q4_K;
66
+ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
67
+ #else
68
  typedef struct {
69
  ggml_fp16_t d; // super-block scale for quantized scales
70
  ggml_fp16_t dmin; // super-block scale for quantized mins
71
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
72
  uint8_t qs[QK_K/2]; // 4--bit quants
73
  } block_q4_K;
74
+ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
75
+ #endif
76
 
77
  // 5-bit quantization
78
  // 16 blocks of 32 elements each
79
  // weight is represented as x = a * q + b
80
  // Effectively 5.5 bits per weight
81
+ #ifdef GGML_QKK_64
82
+ typedef struct {
83
+ ggml_fp16_t d; // super-block scale
84
+ int8_t scales[QK_K/16]; // 8-bit block scales
85
+ uint8_t qh[QK_K/8]; // quants, high bit
86
+ uint8_t qs[QK_K/2]; // quants, low 4 bits
87
+ } block_q5_K;
88
+ static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
89
+ #else
90
  typedef struct {
91
  ggml_fp16_t d; // super-block scale for quantized scales
92
  ggml_fp16_t dmin; // super-block scale for quantized mins
93
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
94
  uint8_t qh[QK_K/8]; // quants, high bit
95
  uint8_t qs[QK_K/2]; // quants, low 4 bits
96
  } block_q5_K;
97
+ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
98
+ #endif
99
 
100
  // 6-bit quantization
101
  // weight is represented as x = a * q
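
The header changes above make the super-block size a compile-time choice: with GGML_QKK_64 defined, QK_K drops from 256 to 64, K_SCALE_SIZE shrinks from 12 to 4, and the q3_K/q4_K/q5_K structs pick up smaller scale arrays, which the updated static_asserts pin down. As an illustration only (assuming ggml_fp16_t is 2 bytes, as the asserts themselves do), the q4_K sizes can be rechecked in a few lines of Python:

    # Recompute the q4_K block sizes asserted above for both QK_K settings (illustrative only).
    FP16 = 2  # sizeof(ggml_fp16_t), as the static_asserts assume

    def q4_k_block_size(qk_k):
        if qk_k == 64:                       # GGML_QKK_64 layout: d[2] + scales[2] + qs[QK_K/2]
            return 2 * FP16 + 2 + qk_k // 2
        return 2 * FP16 + 12 + qk_k // 2     # d + dmin + scales[K_SCALE_SIZE = 12] + qs[QK_K/2]

    for qk_k in (64, 256):
        size = q4_k_block_size(qk_k)
        print(f"QK_K={qk_k}: block_q4_K = {size} bytes, {8 * size / qk_k:.3f} bits per weight")
    # QK_K=64 : 38 bytes  -> 4.75 bits per weight
    # QK_K=256: 144 bytes -> 4.5  bits per weight

The smaller super-block costs a little density (4.75 instead of 4.5 bits per weight for q4_K) but lets the k-quants apply to tensors whose row sizes are not a multiple of 256, which is presumably the point of the new option.
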
klite.embd CHANGED
The diff for this file is too large to render. See raw diff
 
koboldcpp.py CHANGED
@@ -16,6 +16,7 @@ class load_model_inputs(ctypes.Structure):
16
  ("max_context_length", ctypes.c_int),
17
  ("batch_size", ctypes.c_int),
18
  ("f16_kv", ctypes.c_bool),
 
19
  ("executable_path", ctypes.c_char_p),
20
  ("model_filename", ctypes.c_char_p),
21
  ("lora_filename", ctypes.c_char_p),
@@ -77,17 +78,18 @@ lib_failsafe = pick_existant_file("koboldcpp_failsafe.dll","koboldcpp_failsafe.s
77
  lib_openblas = pick_existant_file("koboldcpp_openblas.dll","koboldcpp_openblas.so")
78
  lib_openblas_noavx2 = pick_existant_file("koboldcpp_openblas_noavx2.dll","koboldcpp_openblas_noavx2.so")
79
  lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so")
 
80
 
81
 
82
  def init_library():
83
  global handle
84
- global lib_default,lib_failsafe,lib_openblas,lib_openblas_noavx2,lib_clblast
85
 
86
  libname = ""
87
  use_blas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.
88
  use_clblast = False #uses CLBlast instead
 
89
  use_noavx2 = False #uses openblas with no avx2 instructions
90
-
91
  if args.noavx2:
92
  use_noavx2 = True
93
  if not file_exists(lib_openblas_noavx2) or (os.name=='nt' and not file_exists("libopenblas.dll")):
@@ -103,6 +105,12 @@ def init_library():
103
  else:
104
  print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.")
105
  use_clblast = True
 
 
 
 
 
 
106
  else:
107
  if not file_exists(lib_openblas) or (os.name=='nt' and not file_exists("libopenblas.dll")):
108
  print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.")
@@ -122,6 +130,8 @@ def init_library():
122
  else:
123
  if use_clblast:
124
  libname = lib_clblast
 
 
125
  elif use_blas:
126
  libname = lib_openblas
127
  else:
@@ -150,6 +160,7 @@ def load_model(model_filename):
150
  inputs.batch_size = 8
151
  inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
152
  inputs.threads = args.threads
 
153
  inputs.blasthreads = args.blasthreads
154
  inputs.f16_kv = True
155
  inputs.use_mmap = (not args.nommap)
@@ -225,7 +236,7 @@ maxhordectx = 1024
225
  maxhordelen = 256
226
  modelbusy = False
227
  defaultport = 5001
228
- KcppVersion = "1.33"
229
  showdebug = True
230
 
231
  class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
@@ -581,13 +592,13 @@ def show_gui():
581
  blaschoice = tk.StringVar()
582
  blaschoice.set("BLAS = 512")
583
 
584
- runopts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use No BLAS","Use OpenBLAS (Old CPU, noavx2)","Failsafe Mode (Old CPU, noavx)"]
585
  runchoice = tk.StringVar()
586
  runchoice.set("Use OpenBLAS")
587
 
588
  def onDropdownChange(event):
589
  sel = runchoice.get()
590
- if sel==runopts[1] or sel==runopts[2] or sel==runopts[3]:
591
  frameC.grid(row=4,column=0,pady=4)
592
  else:
593
  frameC.grid_forget()
@@ -609,7 +620,7 @@ def show_gui():
609
  frameC = tk.Frame(root)
610
  gpu_layers_var=tk.StringVar()
611
  gpu_layers_var.set("0")
612
- gpu_lbl = tk.Label(frameC, text = 'GPU Layers (CLBlast only): ', font=('calibre',10, 'bold'))
613
  gpu_layers_input = tk.Entry(frameC,textvariable = gpu_layers_var, font=('calibre',10,'normal'))
614
  gpu_lbl.grid(row=0,column=0)
615
  gpu_layers_input.grid(row=0,column=1)
@@ -663,11 +674,13 @@ def show_gui():
663
  if selrunchoice==runopts[3]:
664
  args.useclblast = [0,1]
665
  if selrunchoice==runopts[4]:
666
- args.noblas = True
667
  if selrunchoice==runopts[5]:
668
- args.noavx2 = True
669
  if selrunchoice==runopts[6]:
670
  args.noavx2 = True
 
 
671
  args.noblas = True
672
  args.nommap = True
673
  print("[Failsafe Mode : mmap is disabled.]")
@@ -848,7 +861,7 @@ if __name__ == '__main__':
848
  parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
849
  parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,4096,8192], default=2048)
850
  parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512)
851
- parser.add_argument("--stream", help="Uses pseudo streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
852
  parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
853
  parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents certain tokens such as EOS and Square Brackets. This flag unbans them.", action='store_true')
854
  parser.add_argument("--usemirostat", help="Experimental! Replaces your samplers with mirostat. Takes 3 params = [type(0/1/2), tau(5.0), eta(0.1)].",metavar=('[type]', '[tau]', '[eta]'), type=float, nargs=3)
@@ -861,7 +874,8 @@ if __name__ == '__main__':
861
  parser.add_argument("--hordeconfig", help="Sets the display model name to something else, for easy use on AI Horde. Optional additional parameters set the horde max genlength and max ctxlen.",metavar=('[hordename]', '[hordelength] [hordectx]'), nargs='+')
862
  compatgroup = parser.add_mutually_exclusive_group()
863
  compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
864
- compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
865
- parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using CLBlast. Requires CLBlast.",metavar=('[GPU layers]'), type=int, default=0)
 
866
  args = parser.parse_args()
867
  main(args)
 
16
  ("max_context_length", ctypes.c_int),
17
  ("batch_size", ctypes.c_int),
18
  ("f16_kv", ctypes.c_bool),
19
+ ("low_vram", ctypes.c_bool),
20
  ("executable_path", ctypes.c_char_p),
21
  ("model_filename", ctypes.c_char_p),
22
  ("lora_filename", ctypes.c_char_p),
 
78
  lib_openblas = pick_existant_file("koboldcpp_openblas.dll","koboldcpp_openblas.so")
79
  lib_openblas_noavx2 = pick_existant_file("koboldcpp_openblas_noavx2.dll","koboldcpp_openblas_noavx2.so")
80
  lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so")
81
+ lib_cublas = pick_existant_file("koboldcpp_cublas.dll","koboldcpp_cublas.so")
82
 
83
 
84
  def init_library():
85
  global handle
86
+ global lib_default,lib_failsafe,lib_openblas,lib_openblas_noavx2,lib_clblast,lib_cublas
87
 
88
  libname = ""
89
  use_blas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.
90
  use_clblast = False #uses CLBlast instead
91
+ use_cublas = False #uses cublas instead
92
  use_noavx2 = False #uses openblas with no avx2 instructions
 
93
  if args.noavx2:
94
  use_noavx2 = True
95
  if not file_exists(lib_openblas_noavx2) or (os.name=='nt' and not file_exists("libopenblas.dll")):
 
105
  else:
106
  print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.")
107
  use_clblast = True
108
+ elif (args.usecublas and args.usecublas!=""):
109
+ if not file_exists(lib_cublas):
110
+ print("Warning: CuBLAS library file not found. Non-BLAS library will be used.")
111
+ else:
112
+ print("Attempting to use CuBLAS library for faster prompt ingestion. A compatible CuBLAS will be required.")
113
+ use_cublas = True
114
  else:
115
  if not file_exists(lib_openblas) or (os.name=='nt' and not file_exists("libopenblas.dll")):
116
  print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.")
 
130
  else:
131
  if use_clblast:
132
  libname = lib_clblast
133
+ elif use_cublas:
134
+ libname = lib_cublas
135
  elif use_blas:
136
  libname = lib_openblas
137
  else:
 
160
  inputs.batch_size = 8
161
  inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
162
  inputs.threads = args.threads
163
+ inputs.low_vram = (True if args.usecublas=="lowvram" else False)
164
  inputs.blasthreads = args.blasthreads
165
  inputs.f16_kv = True
166
  inputs.use_mmap = (not args.nommap)
 
236
  maxhordelen = 256
237
  modelbusy = False
238
  defaultport = 5001
239
+ KcppVersion = "1.34"
240
  showdebug = True
241
 
242
  class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
 
592
  blaschoice = tk.StringVar()
593
  blaschoice.set("BLAS = 512")
594
 
595
+ runopts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use CuBLAS GPU","Use No BLAS","Use OpenBLAS (Old CPU, noavx2)","Failsafe Mode (Old CPU, noavx)"]
596
  runchoice = tk.StringVar()
597
  runchoice.set("Use OpenBLAS")
598
 
599
  def onDropdownChange(event):
600
  sel = runchoice.get()
601
+ if sel==runopts[1] or sel==runopts[2] or sel==runopts[3] or sel==runopts[4]:
602
  frameC.grid(row=4,column=0,pady=4)
603
  else:
604
  frameC.grid_forget()
 
620
  frameC = tk.Frame(root)
621
  gpu_layers_var=tk.StringVar()
622
  gpu_layers_var.set("0")
623
+ gpu_lbl = tk.Label(frameC, text = 'GPU Layers: ', font=('calibre',10, 'bold'))
624
  gpu_layers_input = tk.Entry(frameC,textvariable = gpu_layers_var, font=('calibre',10,'normal'))
625
  gpu_lbl.grid(row=0,column=0)
626
  gpu_layers_input.grid(row=0,column=1)
 
674
  if selrunchoice==runopts[3]:
675
  args.useclblast = [0,1]
676
  if selrunchoice==runopts[4]:
677
+ args.usecublas = True
678
  if selrunchoice==runopts[5]:
679
+ args.noblas = True
680
  if selrunchoice==runopts[6]:
681
  args.noavx2 = True
682
+ if selrunchoice==runopts[7]:
683
+ args.noavx2 = True
684
  args.noblas = True
685
  args.nommap = True
686
  print("[Failsafe Mode : mmap is disabled.]")
 
861
  parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
862
  parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,4096,8192], default=2048)
863
  parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512)
864
+ parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
865
  parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
866
  parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents certain tokens such as EOS and Square Brackets. This flag unbans them.", action='store_true')
867
  parser.add_argument("--usemirostat", help="Experimental! Replaces your samplers with mirostat. Takes 3 params = [type(0/1/2), tau(5.0), eta(0.1)].",metavar=('[type]', '[tau]', '[eta]'), type=float, nargs=3)
 
874
  parser.add_argument("--hordeconfig", help="Sets the display model name to something else, for easy use on AI Horde. Optional additional parameters set the horde max genlength and max ctxlen.",metavar=('[hordename]', '[hordelength] [hordectx]'), nargs='+')
875
  compatgroup = parser.add_mutually_exclusive_group()
876
  compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
877
+ compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
878
+ compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires Nvidia GPU. Select lowvram to not allocate VRAM scratch buffer.", default='', const='normal', nargs='?', choices=['normal', 'lowvram'])
879
+ parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
880
  args = parser.parse_args()
881
  main(args)
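
Taken together, the koboldcpp.py changes wire a CuBLAS backend into the existing selection chain: --usecublas joins --noblas and --useclblast in the mutually exclusive group, optionally takes "lowvram", loads koboldcpp_cublas.dll / koboldcpp_cublas.so when chosen, and feeds low_vram into load_model_inputs. A condensed sketch of that precedence, with the library-file existence checks left out (this is a standalone illustration, not the real init_library):

    # Condensed sketch of the backend precedence added above (file-existence checks omitted).
    import argparse

    parser = argparse.ArgumentParser()
    compat = parser.add_mutually_exclusive_group()
    compat.add_argument("--noblas", action="store_true")
    compat.add_argument("--useclblast", type=int, nargs=2, choices=range(0, 9))
    compat.add_argument("--usecublas", nargs="?", const="normal", default="", choices=["normal", "lowvram"])
    parser.add_argument("--noavx2", action="store_true")
    parser.add_argument("--gpulayers", type=int, default=0)

    def pick_library(args):
        if args.noavx2:
            return "koboldcpp_openblas_noavx2"   # or the failsafe build, depending on what exists
        if args.useclblast:
            return "koboldcpp_clblast"
        if args.usecublas:
            return "koboldcpp_cublas"            # low_vram = (args.usecublas == "lowvram")
        if args.noblas:
            return "koboldcpp"                   # plain non-BLAS library
        return "koboldcpp_openblas"

    args = parser.parse_args(["--usecublas", "lowvram", "--gpulayers", "20"])
    print(pick_library(args))                    # -> koboldcpp_cublas

So an invocation along the lines of "python koboldcpp.py model.bin --usecublas lowvram --gpulayers 20" would pick the CuBLAS library, skip the VRAM scratch buffer, and offload 20 layers; --gpulayers now applies to both CLBlast and CuBLAS, which is why its help text and the "GPU Layers" label in the GUI were generalized.
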
llama-util.h CHANGED
@@ -172,12 +172,14 @@ struct llama_mmap {
172
  #ifdef _POSIX_MAPPED_FILES
173
  static constexpr bool SUPPORTED = true;
174
 
175
- llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
176
  size = file->size;
177
  int fd = fileno(file->fp);
178
  int flags = MAP_SHARED;
 
 
179
  #ifdef __linux__
180
- flags |= MAP_POPULATE;
181
  #endif
182
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
183
  if (addr == MAP_FAILED) {
@@ -191,6 +193,14 @@ struct llama_mmap {
191
  strerror(errno));
192
  }
193
  }
 
 
 
 
 
 
 
 
194
  }
195
 
196
  ~llama_mmap() {
@@ -199,7 +209,9 @@ struct llama_mmap {
199
  #elif defined(_WIN32)
200
  static constexpr bool SUPPORTED = true;
201
 
202
- llama_mmap(struct llama_file * file, bool prefetch = true) {
 
 
203
  size = file->size;
204
 
205
  HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -248,8 +260,10 @@ struct llama_mmap {
248
  #else
249
  static constexpr bool SUPPORTED = false;
250
 
251
- llama_mmap(struct llama_file *, bool prefetch = true) {
252
- (void)prefetch;
 
 
253
  throw std::runtime_error(std::string("mmap not supported"));
254
  }
255
  #endif
 
172
  #ifdef _POSIX_MAPPED_FILES
173
  static constexpr bool SUPPORTED = true;
174
 
175
+ llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
176
  size = file->size;
177
  int fd = fileno(file->fp);
178
  int flags = MAP_SHARED;
179
+ // prefetch/readahead impairs performance on NUMA systems
180
+ if (numa) { prefetch = 0; }
181
  #ifdef __linux__
182
+ if (prefetch) { flags |= MAP_POPULATE; }
183
  #endif
184
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
185
  if (addr == MAP_FAILED) {
 
193
  strerror(errno));
194
  }
195
  }
196
+ if (numa) {
197
+ // advise the kernel not to use readahead
198
+ // (because the next page might not belong on the same node)
199
+ if (madvise(addr, file->size, MADV_RANDOM)) {
200
+ fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
201
+ strerror(errno));
202
+ }
203
+ }
204
  }
205
 
206
  ~llama_mmap() {
 
209
  #elif defined(_WIN32)
210
  static constexpr bool SUPPORTED = true;
211
 
212
+ llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
213
+ (void) numa;
214
+
215
  size = file->size;
216
 
217
  HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
 
260
  #else
261
  static constexpr bool SUPPORTED = false;
262
 
263
+ llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
264
+ (void) prefetch;
265
+ (void) numa;
266
+
267
  throw std::runtime_error(std::string("mmap not supported"));
268
  }
269
  #endif
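
The llama_mmap changes above thread a numa flag through all three constructors: on NUMA systems prefetch is forced to 0 so MAP_POPULATE is not requested, and the finished mapping is advised MADV_RANDOM, because kernel readahead can fault pages onto the wrong node. The same decision logic, sketched with Python's mmap module purely for illustration (Linux-only constants; MAP_POPULATE and madvise are missing from older Python builds, hence the hasattr guards):

    # Illustrative NUMA-aware mapping mirroring the prefetch/madvise logic above (Linux only).
    import mmap
    import os

    def map_model_file(path, prefetch=True, numa=False):
        fd = os.open(path, os.O_RDONLY)
        size = os.fstat(fd).st_size
        flags = mmap.MAP_SHARED
        if numa:
            prefetch = False                      # readahead impairs performance on NUMA systems
        if prefetch and hasattr(mmap, "MAP_POPULATE"):
            flags |= mmap.MAP_POPULATE            # pre-fault the whole file up front
        mapping = mmap.mmap(fd, size, flags=flags, prot=mmap.PROT_READ)
        if numa and hasattr(mmap, "MADV_RANDOM"):
            mapping.madvise(mmap.MADV_RANDOM)     # tell the kernel not to use readahead
        os.close(fd)
        return mapping

The actual numa value comes from ggml_is_numa() at the call sites in llama.cpp further down, so single-node machines keep the old MAP_POPULATE behaviour.
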
llama.cpp CHANGED
@@ -12,7 +12,8 @@
12
  #include "ggml.h"
13
  #ifdef GGML_USE_CUBLAS
14
  #include "ggml-cuda.h"
15
- #elif defined(GGML_USE_CLBLAST)
 
16
  #include "ggml-opencl.h"
17
  #endif
18
 
@@ -21,9 +22,13 @@
21
  #endif
22
  #ifdef GGML_USE_K_QUANTS
23
  #ifndef QK_K
 
 
 
24
  #define QK_K 256
25
  #endif
26
  #endif
 
27
 
28
  #include <array>
29
  #include <ctime>
@@ -62,6 +67,7 @@ enum e_model {
62
  MODEL_65B,
63
  };
64
 
 
65
  static const size_t MB = 1024*1024;
66
 
67
  // computed for n_ctx == 2048
@@ -125,6 +131,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
125
  return k_sizes;
126
  }
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  // default hparams (LLaMA 7B)
129
  struct llama_hparams {
130
  uint32_t n_vocab = 32000;
@@ -360,96 +394,14 @@ static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml
360
  return size / ggml_blck_size(type);
361
  }
362
 
363
- struct llama_load_tensor_shard {
364
- std::vector<uint32_t> ne;
365
- size_t size;
366
- enum ggml_type type;
367
- size_t file_idx;
368
- size_t file_off;
369
-
370
- void calc_size() {
371
- size = llama_calc_tensor_size(ne, type);
372
- }
373
- };
374
-
375
- enum llama_split_type {
376
- SPLIT_NONE,
377
- SPLIT_BY_COLUMNS,
378
- SPLIT_BY_ROWS
379
- };
380
-
381
  struct llama_load_tensor {
382
- std::vector<llama_load_tensor_shard> shards;
383
-
384
  std::string name;
385
  enum ggml_type type = GGML_TYPE_F32;
386
- llama_split_type split_type = SPLIT_NONE;
387
  std::vector<uint32_t> ne;
 
388
  size_t size;
389
  struct ggml_tensor * ggml_tensor = NULL;
390
  uint8_t * data;
391
-
392
- llama_load_tensor(const std::string & name) : name(name) {}
393
-
394
- void calc_all() {
395
- calc_type();
396
- calc_split_type();
397
- calc_ne();
398
- calc_size();
399
- }
400
-
401
- void calc_type() {
402
- const auto & first_shard = shards.at(0);
403
- for (const auto & shard : shards) {
404
- if (shard.type != first_shard.type) {
405
- throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
406
- }
407
- }
408
- type = first_shard.type;
409
- }
410
-
411
- void calc_split_type() {
412
- if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
413
- shards.size() == 1) { // only one file?
414
- split_type = SPLIT_NONE;
415
- } else if (name.find("tok_embeddings.") == 0 ||
416
- name.find(".attention.wo.weight") != std::string::npos ||
417
- name.find(".feed_forward.w2.weight") != std::string::npos) {
418
- split_type = SPLIT_BY_COLUMNS;
419
- } else {
420
- split_type = SPLIT_BY_ROWS;
421
- }
422
- }
423
-
424
- void calc_ne() {
425
- const auto & first_shard = shards.at(0);
426
- for (const auto & shard : shards) {
427
- if (shard.ne != first_shard.ne) {
428
- throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
429
- name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
430
- }
431
- }
432
- ne = first_shard.ne;
433
- LLAMA_ASSERT(shards.size() <= UINT32_MAX);
434
- uint32_t n_shards = (uint32_t) shards.size();
435
- switch (split_type) {
436
- case SPLIT_NONE:
437
- ne = first_shard.ne;
438
- break;
439
- case SPLIT_BY_COLUMNS:
440
- ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
441
- first_shard.ne[1]};
442
- break;
443
- case SPLIT_BY_ROWS:
444
- ne = {first_shard.ne[0],
445
- checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
446
- break;
447
- }
448
- }
449
-
450
- void calc_size() {
451
- size = llama_calc_tensor_size(ne, type);
452
- }
453
  };
454
 
455
  struct llama_load_tensors_map {
@@ -472,13 +424,13 @@ struct llama_file_loader {
472
  llama_hparams hparams;
473
  llama_vocab vocab;
474
 
475
- llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
476
  : file(fname, "rb") {
477
  fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
478
  read_magic();
479
  read_hparams();
480
  read_vocab();
481
- read_tensor_metadata(file_idx, tensors_map);
482
  }
483
  void read_magic() {
484
  uint32_t magic = file.read_u32();
@@ -535,19 +487,19 @@ struct llama_file_loader {
535
  tok_score.score = score;
536
  }
537
  }
538
- void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) {
539
  while (file.tell() < file.size) {
540
- llama_load_tensor_shard shard;
541
  uint32_t n_dims = file.read_u32();
542
  uint32_t name_len = file.read_u32();
543
- shard.type = (enum ggml_type) file.read_u32();
544
- shard.ne.resize(n_dims);
545
- file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
546
  std::string name = file.read_string(name_len);
547
  if (n_dims < 1 || n_dims > 2) {
548
  throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
549
  }
550
- switch (shard.type) {
551
  case GGML_TYPE_F32:
552
  case GGML_TYPE_F16:
553
  case GGML_TYPE_Q4_0:
@@ -562,30 +514,20 @@ struct llama_file_loader {
562
  case GGML_TYPE_Q6_K:
563
  break;
564
  default: {
565
- throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
566
  }
567
  }
568
 
569
- if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
570
- // skip to the next multiple of 32 bytes
571
- file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
572
- }
573
- shard.file_idx = file_idx;
574
- shard.file_off = file.tell();
575
 
576
- shard.calc_size();
577
- file.seek(shard.size, SEEK_CUR);
 
 
578
 
579
- auto it = tensors_map.name_to_idx.find(name);
580
- size_t idx;
581
- if (it != tensors_map.name_to_idx.end()) {
582
- idx = it->second;
583
- } else {
584
- tensors_map.tensors.emplace_back(name);
585
- idx = tensors_map.tensors.size() - 1;
586
- tensors_map.name_to_idx.emplace(name, idx);
587
- }
588
- tensors_map.tensors.at(idx).shards.push_back(shard);
589
  }
590
  }
591
  };
@@ -655,56 +597,19 @@ struct llama_file_saver {
655
  };
656
 
657
  struct llama_model_loader {
658
- std::vector<std::unique_ptr<llama_file_loader>> file_loaders;
659
  llama_load_tensors_map tensors_map;
660
  bool use_mmap;
661
  size_t num_ggml_tensors_created = 0;
662
  struct ggml_context * ggml_ctx = NULL;
663
  std::unique_ptr<llama_mmap> mapping;
664
 
665
- llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
666
- auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
667
- file_loaders.emplace_back(first_file);
668
- uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
669
- for (uint32_t i = 1; i < n_parts; i++) {
670
- std::string fname = fname_base + "." + std::to_string(i);
671
- auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
672
- file_loaders.emplace_back(ith_file);
673
- if (ith_file->hparams != first_file->hparams) {
674
- throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
675
- }
676
- }
677
  if (!llama_mmap::SUPPORTED) {
678
  use_mmap = false;
679
  }
680
- if (use_mmap && alignment_prevents_mmap()) {
681
- fprintf(stderr, "llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
682
- use_mmap = false;
683
- }
684
  this->use_mmap = use_mmap;
685
- for (llama_load_tensor & lt : tensors_map.tensors) {
686
- lt.calc_all();
687
- }
688
- }
689
-
690
- bool alignment_prevents_mmap() {
691
- for (const llama_load_tensor & lt : tensors_map.tensors) {
692
- for (const llama_load_tensor_shard & shard : lt.shards) {
693
- if (shard.file_off & 3) {
694
- return true;
695
- }
696
- }
697
- }
698
- return false;
699
- }
700
-
701
- uint32_t guess_n_parts() const {
702
- auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
703
- if (it == tensors_map.name_to_idx.end()) {
704
- throw std::runtime_error(std::string("missing tok_embeddings.weight"));
705
- }
706
- const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
707
- return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
708
  }
709
 
710
  void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -770,7 +675,7 @@ struct llama_model_loader {
770
  }
771
 
772
  if (use_mmap) {
773
- mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
774
  if (lmlock) {
775
  lmlock->init(mapping->addr);
776
  }
@@ -826,45 +731,13 @@ struct llama_model_loader {
826
 
827
  void load_data_for(llama_load_tensor & lt) {
828
  if (use_mmap) {
829
- LLAMA_ASSERT(lt.shards.size() == 1);
830
- lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
831
- } else if (lt.split_type == SPLIT_NONE) {
832
- llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
833
- file.seek(lt.shards.at(0).file_off, SEEK_SET);
834
  file.read_raw(lt.data, lt.size);
835
- } else if (lt.split_type == SPLIT_BY_ROWS) {
836
- size_t offset = 0;
837
- for (llama_load_tensor_shard & shard : lt.shards) {
838
- llama_file & file = file_loaders.at(shard.file_idx)->file;
839
- file.seek(shard.file_off, SEEK_SET);
840
- file.read_raw(lt.data + offset, shard.size);
841
- offset += shard.size;
842
- }
843
- LLAMA_ASSERT(offset == lt.size);
844
- } else if (lt.split_type == SPLIT_BY_COLUMNS) {
845
- // Let's load the data into temporary buffers to ensure the OS performs large loads.
846
- std::vector<llama_buffer> tmp_bufs(lt.shards.size());
847
- for (size_t i = 0; i < lt.shards.size(); i++) {
848
- llama_load_tensor_shard & shard = lt.shards.at(i);
849
- llama_file & file = file_loaders.at(shard.file_idx)->file;
850
- file.seek(shard.file_off, SEEK_SET);
851
- tmp_bufs.at(i).resize(shard.size);
852
- file.read_raw(tmp_bufs.at(i).addr, shard.size);
853
- }
854
- // Then reshape.
855
- size_t num_rows = lt.ne.at(1);
856
- size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
857
- size_t out_offset = 0;
858
- for (size_t row = 0; row < num_rows; row++) {
859
- for (llama_buffer & tmp_buf : tmp_bufs) {
860
- memcpy(lt.data + out_offset,
861
- tmp_buf.addr + row * per_shard_row_size,
862
- per_shard_row_size);
863
- out_offset += per_shard_row_size;
864
- }
865
- }
866
- LLAMA_ASSERT(out_offset == lt.size);
867
  }
 
868
  if (0) {
869
  print_checksum(lt);
870
  }
@@ -934,7 +807,7 @@ static bool kv_cache_init(
934
 
935
  struct llama_context_params llama_context_default_params() {
936
  struct llama_context_params result = {
937
- /*.seed =*/ -1,
938
  /*.n_ctx =*/ 512,
939
  /*.n_batch =*/ 512,
940
  /*.gpu_layers =*/ 0,
@@ -973,7 +846,7 @@ bool llama_mlock_supported() {
973
  return llama_mlock::SUPPORTED;
974
  }
975
 
976
- void llama_init_backend() {
977
  ggml_time_init();
978
 
979
  // needed to initialize f16 tables
@@ -982,6 +855,10 @@ void llama_init_backend() {
982
  struct ggml_context * ctx = ggml_init(params);
983
  ggml_free(ctx);
984
  }
 
 
 
 
985
  }
986
 
987
  int64_t llama_time_us() {
@@ -1059,12 +936,12 @@ static void llama_model_load_internal(
1059
 
1060
  model.t_start_us = ggml_time_us();
1061
 
1062
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
1063
 
1064
- vocab = std::move(ml->file_loaders.at(0)->vocab);
1065
- model.hparams = ml->file_loaders.at(0)->hparams;
1066
  model.n_gpu_layers = n_gpu_layers;
1067
- llama_file_version file_version = ml->file_loaders.at(0)->file_version;
1068
  auto & hparams = model.hparams;
1069
 
1070
  {
@@ -1098,7 +975,6 @@ static void llama_model_load_internal(
1098
  fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
1099
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
1100
  fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
1101
- fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
1102
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
1103
  }
1104
 
@@ -1245,11 +1121,12 @@ static void llama_model_load_internal(
1245
  const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
1246
 
1247
  // this is the total memory required to run the inference
 
1248
  const size_t mem_required =
1249
  ctx_size +
1250
  mmapped_size - vram_weights + // weights in VRAM not in memory
1251
- MEM_REQ_SCRATCH0().at(model.type) +
1252
- MEM_REQ_SCRATCH1().at(model.type) +
1253
  MEM_REQ_EVAL().at (model.type);
1254
 
1255
  // this is the memory required by one llama_state
@@ -1266,11 +1143,14 @@ static void llama_model_load_internal(
1266
  fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
1267
  ggml_cuda_set_scratch_size(0); // disable scratch
1268
  } else {
1269
- vram_scratch = n_batch * MB;
 
 
1270
  ggml_cuda_set_scratch_size(vram_scratch);
1271
  if (n_gpu_layers > 0) {
1272
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
1273
- __func__, vram_scratch / MB);
 
1274
  }
1275
  }
1276
  #endif // GGML_USE_CUBLAS
@@ -1361,18 +1241,20 @@ static bool llama_model_load(
1361
 
1362
  // evaluate the transformer
1363
  //
1364
- // - lctx: llama context
1365
- // - tokens: new batch of tokens to process
1366
- // - n_past: the context size so far
1367
- // - n_threads: number of threads to use
1368
- // - cgraph_fname: filename of the exported computation graph
 
1369
  //
1370
  static bool llama_eval_internal(
1371
- llama_context & lctx,
1372
- const llama_token * tokens,
1373
- const int n_tokens,
1374
- const int n_past,
1375
- const int n_threads,
 
1376
  const char * cgraph_fname) {
1377
 
1378
  // // enforce that the first token is BOS
@@ -1416,12 +1298,18 @@ static bool llama_eval_internal(
1416
  ggml_cgraph gf = {};
1417
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1418
 
1419
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1420
- ggml_set_name(embd, "embd");
1421
- memcpy(embd->data, tokens, N*ggml_element_size(embd));
1422
-
1423
  struct ggml_tensor * cur;
1424
- struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
 
 
 
 
 
 
 
 
 
1425
 
1426
  const int i_gpu_start = n_layer - n_gpu_layers;
1427
  (void) i_gpu_start;
@@ -1483,11 +1371,11 @@ static bool llama_eval_internal(
1483
  offload_func_kq(tmpq);
1484
  ggml_set_name(tmpq, "tmpq");
1485
 
1486
- struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
1487
  offload_func_kq(Kcur);
1488
  ggml_set_name(Kcur, "Kcur");
1489
 
1490
- struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
1491
  offload_func_kq(Qcur);
1492
  ggml_set_name(Qcur, "Qcur");
1493
 
@@ -2443,9 +2331,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2443
  nthread = std::thread::hardware_concurrency();
2444
  }
2445
 
2446
- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
2447
- /*vocab_only*/ false));
2448
- llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
2449
 
2450
  #ifdef GGML_USE_K_QUANTS
2451
  int n_attention_wv = 0;
@@ -2470,6 +2357,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2470
  std::vector<std::thread> workers;
2471
  std::mutex mutex;
2472
 
 
 
 
 
2473
  size_t idx = 0;
2474
  for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
2475
  llama_buffer read_data;
@@ -2524,15 +2415,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2524
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2525
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2526
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
2527
- (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
2528
- (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
 
2529
  ++i_attention_wv;
2530
  } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
2531
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2532
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2533
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
2534
- (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
2535
- (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
2536
  ++i_feed_forward_w2;
2537
  } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
2538
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
@@ -2641,6 +2533,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2641
  }
2642
  }
2643
 
 
 
2644
  //
2645
  // interface implementation
2646
  //
@@ -2679,7 +2573,7 @@ struct llama_context * llama_new_context_with_model(
2679
 
2680
  llama_context * ctx = new llama_context(*model, model->vocab);
2681
 
2682
- if (params.seed < 0) {
2683
  params.seed = time(NULL);
2684
  }
2685
 
@@ -2733,8 +2627,9 @@ struct llama_context * llama_new_context_with_model(
2733
 
2734
  ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
2735
 
2736
- ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
2737
- ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
 
2738
  }
2739
 
2740
  #ifdef GGML_USE_METAL
@@ -2861,7 +2756,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2861
 
2862
  // create a name -> tensor map of the model to accelerate lookups
2863
  std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
2864
- for (auto & kv: model.tensors_by_name) {
2865
  model_tensors.insert(kv);
2866
  }
2867
 
@@ -2872,7 +2767,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2872
  llama_buffer base_buf;
2873
  if (path_base_model) {
2874
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
2875
- model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
2876
 
2877
  size_t ctx_size;
2878
  size_t mmapped_size;
@@ -2890,7 +2785,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2890
 
2891
  // maybe this should in llama_model_loader
2892
  if (model_loader->use_mmap) {
2893
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
2894
  }
2895
  }
2896
 
@@ -2951,7 +2846,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2951
  return false;
2952
  }
2953
  }
2954
- ggml_tensor* lora_tensor;
2955
  if (n_dims == 2) {
2956
  lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
2957
  }
@@ -2959,6 +2854,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2959
  fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
2960
  return 1;
2961
  }
 
2962
 
2963
  // load tensor data
2964
  size_t offset = fin.tellg();
@@ -2974,6 +2870,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
2974
  lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
2975
 
2976
  ggml_tensor * dest_t = model_tensors[base_name];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2977
  ggml_tensor * base_t;
2978
  if (model_loader) {
2979
  // load from base model
@@ -3001,7 +2912,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3001
  }
3002
 
3003
  ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
 
 
 
3004
  ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
 
 
3005
 
3006
  if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
3007
  fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -3011,19 +2927,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
3011
 
3012
  // w = w + BA*s
3013
  ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
 
 
3014
 
3015
  if (scaling != 1.0f) {
3016
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
 
 
3017
  BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
 
 
3018
  }
3019
 
3020
  ggml_tensor * r;
3021
  if (base_t == dest_t) {
3022
  r = ggml_add_inplace(lora_ctx, dest_t, BA);
 
 
3023
  }
3024
  else {
3025
  r = ggml_add(lora_ctx, base_t, BA);
 
 
 
3026
  r = ggml_cpy(lora_ctx, r, dest_t);
 
 
3027
  }
3028
 
3029
  struct ggml_cgraph gf = ggml_build_forward(r);
@@ -3078,8 +3007,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
3078
 
3079
  #define LLAMA_MAX_RNG_STATE (64*1024)
3080
 
3081
- void llama_set_rng_seed(struct llama_context * ctx, int seed) {
3082
- if (seed < 0) {
3083
  seed = time(NULL);
3084
  }
3085
  ctx->rng.seed(seed);
@@ -3408,7 +3337,29 @@ int llama_eval(
3408
  int n_tokens,
3409
  int n_past,
3410
  int n_threads) {
3411
- if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3412
  fprintf(stderr, "%s: failed to eval\n", __func__);
3413
  return 1;
3414
  }
@@ -3429,7 +3380,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
3429
 
3430
  const std::vector<llama_token> tmp(n_batch, llama_token_bos());
3431
 
3432
- if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
3433
  fprintf(stderr, "%s: failed to eval\n", __func__);
3434
  return 1;
3435
  }
 
12
  #include "ggml.h"
13
  #ifdef GGML_USE_CUBLAS
14
  #include "ggml-cuda.h"
15
+ #endif
16
+ #if defined(GGML_USE_CLBLAST)
17
  #include "ggml-opencl.h"
18
  #endif
19
 
 
22
  #endif
23
  #ifdef GGML_USE_K_QUANTS
24
  #ifndef QK_K
25
+ #ifdef GGML_QKK_64
26
+ #define QK_K 64
27
+ #else
28
  #define QK_K 256
29
  #endif
30
  #endif
31
+ #endif
32
 
33
  #include <array>
34
  #include <ctime>
 
67
  MODEL_65B,
68
  };
69
 
70
+ static const size_t kB = 1024;
71
  static const size_t MB = 1024*1024;
72
 
73
  // computed for n_ctx == 2048
 
131
  return k_sizes;
132
  }
133
 
134
+ // amount of VRAM needed per batch size to hold temporary results
135
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
136
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
137
+ {
138
+ static std::map<e_model, size_t> k_sizes = {
139
+ { MODEL_3B, 512ull * kB },
140
+ { MODEL_7B, 512ull * kB },
141
+ { MODEL_13B, 640ull * kB },
142
+ { MODEL_30B, 768ull * kB },
143
+ { MODEL_65B, 1536ull * kB },
144
+ };
145
+ return k_sizes;
146
+ }
147
+
148
+ // amount of VRAM needed per batch size and context to hold temporary results
149
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
150
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
151
+ {
152
+ static std::map<e_model, size_t> k_sizes = {
153
+ { MODEL_3B, 128ull },
154
+ { MODEL_7B, 128ull },
155
+ { MODEL_13B, 160ull },
156
+ { MODEL_30B, 208ull },
157
+ { MODEL_65B, 416ull },
158
+ };
159
+ return k_sizes;
160
+ }
161
+
162
  // default hparams (LLaMA 7B)
163
  struct llama_hparams {
164
  uint32_t n_vocab = 32000;
 
394
  return size / ggml_blck_size(type);
395
  }
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  struct llama_load_tensor {
 
 
398
  std::string name;
399
  enum ggml_type type = GGML_TYPE_F32;
 
400
  std::vector<uint32_t> ne;
401
+ size_t file_off;
402
  size_t size;
403
  struct ggml_tensor * ggml_tensor = NULL;
404
  uint8_t * data;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
  };
406
 
407
  struct llama_load_tensors_map {
 
424
  llama_hparams hparams;
425
  llama_vocab vocab;
426
 
427
+ llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
428
  : file(fname, "rb") {
429
  fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
430
  read_magic();
431
  read_hparams();
432
  read_vocab();
433
+ read_tensor_metadata(tensors_map);
434
  }
435
  void read_magic() {
436
  uint32_t magic = file.read_u32();
 
487
  tok_score.score = score;
488
  }
489
  }
490
+ void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
491
  while (file.tell() < file.size) {
492
+ llama_load_tensor tensor;
493
  uint32_t n_dims = file.read_u32();
494
  uint32_t name_len = file.read_u32();
495
+ tensor.type = (enum ggml_type) file.read_u32();
496
+ tensor.ne.resize(n_dims);
497
+ file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
498
  std::string name = file.read_string(name_len);
499
  if (n_dims < 1 || n_dims > 2) {
500
  throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
501
  }
502
+ switch (tensor.type) {
503
  case GGML_TYPE_F32:
504
  case GGML_TYPE_F16:
505
  case GGML_TYPE_Q4_0:
 
514
  case GGML_TYPE_Q6_K:
515
  break;
516
  default: {
517
+ throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
518
  }
519
  }
520
 
521
+ // skip to the next multiple of 32 bytes
522
+ file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
 
 
 
 
523
 
524
+ tensor.file_off = file.tell();
525
+ tensor.name = name;
526
+ tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
527
+ file.seek(tensor.size, SEEK_CUR);
528
 
529
+ tensors_map.tensors.push_back(tensor);
530
+ tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
 
 
 
 
 
 
 
 
531
  }
532
  }
533
  };
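
The loader hunks above drop multi-part (sharded) GGML files entirely: llama_load_tensor_shard, the split-type logic, and guess_n_parts are gone, and each tensor now carries a single file_off recorded after the 32-byte alignment seek. The metadata scan is simple enough to mirror in Python; the sketch below assumes little-endian u32 header fields in the order the C++ reads them, and only handles the two float types so it stays self-contained.

    # Sketch of the single-file tensor metadata scan above (field order taken from the C++ hunk).
    import struct

    def tensor_nbytes(ne, ttype):
        n = 1
        for d in ne:
            n *= d
        if ttype == 0:   # GGML_TYPE_F32 (enum value assumed)
            return 4 * n
        if ttype == 1:   # GGML_TYPE_F16 (enum value assumed)
            return 2 * n
        raise NotImplementedError("quantized types need block sizes, see llama_calc_tensor_size")

    def read_tensor_metadata(f, file_size):
        tensors = {}
        while f.tell() < file_size:
            n_dims, name_len, ttype = struct.unpack("<3I", f.read(12))
            ne = struct.unpack(f"<{n_dims}I", f.read(4 * n_dims))
            name = f.read(name_len).decode("utf-8")
            f.seek(-f.tell() & 31, 1)          # skip to the next multiple of 32 bytes
            file_off = f.tell()
            size = tensor_nbytes(ne, ttype)
            f.seek(size, 1)                    # skip over the tensor data itself
            tensors[name] = (ttype, ne, file_off, size)
        return tensors

One consequence visible further down: llama_model_loader loses its vocab_only parameter and the "n_parts" log line, and the alignment_prevents_mmap check disappears along with the rest of the shard bookkeeping.
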
 
597
  };
598
 
599
  struct llama_model_loader {
600
+ std::unique_ptr<llama_file_loader> file_loader;
601
  llama_load_tensors_map tensors_map;
602
  bool use_mmap;
603
  size_t num_ggml_tensors_created = 0;
604
  struct ggml_context * ggml_ctx = NULL;
605
  std::unique_ptr<llama_mmap> mapping;
606
 
607
+ llama_model_loader(const std::string & fname_base, bool use_mmap) {
608
+ file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
 
 
 
 
 
 
 
 
 
 
609
  if (!llama_mmap::SUPPORTED) {
610
  use_mmap = false;
611
  }
 
 
 
 
612
  this->use_mmap = use_mmap;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
613
  }
614
 
615
  void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
 
675
  }
676
 
677
  if (use_mmap) {
678
+ mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
679
  if (lmlock) {
680
  lmlock->init(mapping->addr);
681
  }
 
731
 
732
  void load_data_for(llama_load_tensor & lt) {
733
  if (use_mmap) {
734
+ lt.data = (uint8_t *) mapping->addr + lt.file_off;
735
+ } else {
736
+ llama_file & file = file_loader->file;
737
+ file.seek(lt.file_off, SEEK_SET);
 
738
  file.read_raw(lt.data, lt.size);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
739
  }
740
+
741
  if (0) {
742
  print_checksum(lt);
743
  }
 
807
 
808
  struct llama_context_params llama_context_default_params() {
809
  struct llama_context_params result = {
810
+ /*.seed =*/ LLAMA_DEFAULT_SEED,
811
  /*.n_ctx =*/ 512,
812
  /*.n_batch =*/ 512,
813
  /*.gpu_layers =*/ 0,
 
846
  return llama_mlock::SUPPORTED;
847
  }
848
 
849
+ void llama_init_backend(bool numa) {
850
  ggml_time_init();
851
 
852
  // needed to initialize f16 tables
 
855
  struct ggml_context * ctx = ggml_init(params);
856
  ggml_free(ctx);
857
  }
858
+
859
+ if (numa) {
860
+ ggml_numa_init();
861
+ }
862
  }
863
 
864
  int64_t llama_time_us() {
 
936
 
937
  model.t_start_us = ggml_time_us();
938
 
939
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
940
 
941
+ vocab = std::move(ml->file_loader->vocab);
942
+ model.hparams = ml->file_loader->hparams;
943
  model.n_gpu_layers = n_gpu_layers;
944
+ llama_file_version file_version = ml->file_loader->file_version;
945
  auto & hparams = model.hparams;
946
 
947
  {
 
975
  fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
976
  fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
977
  fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
 
978
  fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
979
  }
980
 
 
1121
  const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
1122
 
1123
  // this is the total memory required to run the inference
1124
+ const size_t bigctxmul = (hparams.n_ctx>2048?2:1);
1125
  const size_t mem_required =
1126
  ctx_size +
1127
  mmapped_size - vram_weights + // weights in VRAM not in memory
1128
+ MEM_REQ_SCRATCH0().at(model.type)*bigctxmul +
1129
+ MEM_REQ_SCRATCH1().at(model.type)*bigctxmul +
1130
  MEM_REQ_EVAL().at (model.type);
1131
 
1132
  // this is the memory required by one llama_state
 
1143
  fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
1144
  ggml_cuda_set_scratch_size(0); // disable scratch
1145
  } else {
1146
+ const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
1147
+ const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
1148
+ vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
1149
  ggml_cuda_set_scratch_size(vram_scratch);
1150
  if (n_gpu_layers > 0) {
1151
+ fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
1152
+ __func__, vram_scratch_base / kB, vram_scratch_per_context,
1153
+ (vram_scratch + MB - 1) / MB); // round up
1154
  }
1155
  }
1156
  #endif // GGML_USE_CUBLAS
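
The scratch-buffer hunk above replaces the old fixed "batch_size x 1 MB" CUDA scratch allocation with a model- and context-dependent formula, vram_scratch = n_batch * (base + n_ctx * per_context), using the two VRAM_REQ_SCRATCH tables added near the top of the file. A worked example for a 7B model, plugging in the table entries and assuming the default n_batch = 512 and n_ctx = 2048:

    # Worked example of the new VRAM scratch formula for MODEL_7B (illustrative).
    kB, MB = 1024, 1024 * 1024
    vram_scratch_base = 512 * kB      # VRAM_REQ_SCRATCH_BASE() entry for MODEL_7B
    vram_scratch_per_context = 128    # VRAM_REQ_SCRATCH_PER_CONTEXT() entry for MODEL_7B, in bytes
    n_batch, n_ctx = 512, 2048

    vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context)
    print((vram_scratch + MB - 1) // MB, "MB")   # -> 384 MB, the value the fprintf above would report

So larger contexts now grow the scratch buffer instead of relying on a flat 1 MB per batch element, and the low_vram path still disables the buffer entirely.
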
 
1241
 
1242
  // evaluate the transformer
1243
  //
1244
+ // - lctx: llama context
1245
+ // - tokens: new batch of tokens to process
1246
+ // - embd embeddings input
1247
+ // - n_tokens number of tokens
1248
+ // - n_past: the context size so far
1249
+ // - n_threads: number of threads to use
1250
  //
1251
  static bool llama_eval_internal(
1252
+ llama_context & lctx,
1253
+ const llama_token * tokens,
1254
+ const float * embd,
1255
+ const int n_tokens,
1256
+ const int n_past,
1257
+ const int n_threads,
1258
  const char * cgraph_fname) {
1259
 
1260
  // // enforce that the first token is BOS
 
1298
  ggml_cgraph gf = {};
1299
  gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
1300
 
 
 
 
 
1301
  struct ggml_tensor * cur;
1302
+ struct ggml_tensor * inpL;
1303
+
1304
+ if (tokens) {
1305
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1306
+ ggml_set_name(embd, "embd");
1307
+ memcpy(embd->data, tokens, N*ggml_element_size(embd));
1308
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
1309
+ } else {
1310
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
1311
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
1312
+ }
1313
 
1314
  const int i_gpu_start = n_layer - n_gpu_layers;
1315
  (void) i_gpu_start;
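
The hunk above is the embedding-input change: llama_eval_internal now takes both a tokens pointer and a float embd pointer, and builds inpL either by looking up token embeddings with ggml_get_rows or by copying the caller-supplied n_embd x N embedding matrix directly. Conceptually, in a toy Python sketch with a plain dict standing in for the real embedding tensor (names here are illustrative only):

    # Toy version of the tokens-vs-embeddings branch above.
    def build_input_layer(tokens, embd, tok_embeddings, n_embd):
        if tokens is not None:
            # normal path: rows of the embedding matrix selected by token id
            return [tok_embeddings[t] for t in tokens]
        # new path: the caller already supplies one n_embd-wide vector per position
        assert all(len(row) == n_embd for row in embd)
        return [list(row) for row in embd]

    tok_embeddings = {0: [0.0, 0.0], 1: [0.1, 0.2], 2: [0.3, 0.4]}   # fake 2-dim table
    print(build_input_layer([1, 2], None, tok_embeddings, 2))        # token path
    print(build_input_layer(None, [[0.5, 0.6]], tok_embeddings, 2))  # embedding path

Exactly one of tokens and embd is expected to be non-null; the rest of the transformer is unchanged, so downstream code does not care which path produced inpL.
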
 
1371
  offload_func_kq(tmpq);
1372
  ggml_set_name(tmpq, "tmpq");
1373
 
1374
+ struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
1375
  offload_func_kq(Kcur);
1376
  ggml_set_name(Kcur, "Kcur");
1377
 
1378
+ struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
1379
  offload_func_kq(Qcur);
1380
  ggml_set_name(Qcur, "Qcur");
1381
 
 
2331
  nthread = std::thread::hardware_concurrency();
2332
  }
2333
 
2334
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
2335
+ llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);
 
2336
 
2337
  #ifdef GGML_USE_K_QUANTS
2338
  int n_attention_wv = 0;
 
2357
  std::vector<std::thread> workers;
2358
  std::mutex mutex;
2359
 
2360
+ auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
2361
+ return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
2362
+ };
2363
+
2364
  size_t idx = 0;
2365
  for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
2366
  llama_buffer read_data;
 
2415
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2416
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2417
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
2418
+ use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
2419
+ else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
2420
+ (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
2421
  ++i_attention_wv;
2422
  } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
2423
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2424
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
2425
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
2426
+ use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
2427
+ //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
2428
  ++i_feed_forward_w2;
2429
  } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
2430
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
 
2533
  }
2534
  }
2535
 
2536
+
2537
+
2538
  //
2539
  // interface implementation
2540
  //
 
2573
 
2574
  llama_context * ctx = new llama_context(*model, model->vocab);
2575
 
2576
+ if (params.seed == LLAMA_DEFAULT_SEED) {
2577
  params.seed = time(NULL);
2578
  }
2579
 
 
2627
 
2628
  ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
2629
 
2630
+ const size_t bigctxmul = (hparams.n_ctx>2048?2:1);
2631
+ ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)*bigctxmul);
2632
+ ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)*bigctxmul);
2633
  }
2634
 
2635
  #ifdef GGML_USE_METAL
 
2756
 
2757
  // create a name -> tensor map of the model to accelerate lookups
2758
  std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
2759
+ for (const auto & kv: model.tensors_by_name) {
2760
  model_tensors.insert(kv);
2761
  }
2762
 
 
2767
  llama_buffer base_buf;
2768
  if (path_base_model) {
2769
  fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
2770
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
2771
 
2772
  size_t ctx_size;
2773
  size_t mmapped_size;
 
2785
 
2786
  // maybe this should be in llama_model_loader
2787
  if (model_loader->use_mmap) {
2788
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
2789
  }
2790
  }
2791
 
 
2846
  return false;
2847
  }
2848
  }
2849
+ ggml_tensor * lora_tensor;
2850
  if (n_dims == 2) {
2851
  lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
2852
  }
 
2854
  fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
2855
  return 1;
2856
  }
2857
+ ggml_set_name(lora_tensor, "lora_tensor");
2858
 
2859
  // load tensor data
2860
  size_t offset = fin.tellg();
 
2870
  lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
2871
 
2872
  ggml_tensor * dest_t = model_tensors[base_name];
2873
+
2874
+ offload_func_t offload_func = llama_nop;
2875
+ offload_func_t offload_func_force_inplace = llama_nop;
2876
+
2877
+ #ifdef GGML_USE_CUBLAS
2878
+ if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
2879
+ if (dest_t->type != GGML_TYPE_F16) {
2880
+ throw std::runtime_error(format(
2881
+ "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
2882
+ }
2883
+ offload_func = ggml_cuda_assign_buffers;
2884
+ offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
2885
+ }
2886
+ #endif // GGML_USE_CUBLAS
2887
+
2888
  ggml_tensor * base_t;
2889
  if (model_loader) {
2890
  // load from base model
 
2912
  }
2913
 
2914
  ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
2915
+ GGML_ASSERT(loraA->type == GGML_TYPE_F32);
2916
+ ggml_set_name(loraA, "loraA");
2917
+
2918
  ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
2919
+ GGML_ASSERT(loraB->type == GGML_TYPE_F32);
2920
+ ggml_set_name(loraB, "loraB");
2921
 
2922
  if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
2923
  fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
 
2927
 
2928
  // w = w + BA*s
2929
  ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
2930
+ offload_func(BA);
2931
+ ggml_set_name(BA, "BA");
2932
 
2933
  if (scaling != 1.0f) {
2934
  ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
2935
+ ggml_set_name(scale_tensor, "scale_tensor");
2936
+
2937
  BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
2938
+ offload_func(BA);
2939
+ ggml_set_name(BA, "BA_scaled");
2940
  }
2941
 
2942
  ggml_tensor * r;
2943
  if (base_t == dest_t) {
2944
  r = ggml_add_inplace(lora_ctx, dest_t, BA);
2945
+ offload_func_force_inplace(r);
2946
+ ggml_set_name(r, "r_add_inplace");
2947
  }
2948
  else {
2949
  r = ggml_add(lora_ctx, base_t, BA);
2950
+ offload_func(r);
2951
+ ggml_set_name(r, "r_add");
2952
+
2953
  r = ggml_cpy(lora_ctx, r, dest_t);
2954
+ offload_func(r);
2955
+ ggml_set_name(r, "r_cpy");
2956
  }
2957
 
2958
  struct ggml_cgraph gf = ggml_build_forward(r);
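The graph built above evaluates w = w + BA*scaling, with scaling = alpha/r taken from the adapter header. Ignoring ggml's layout and transposition conventions, the dense equivalent is roughly this (shapes are illustrative, row-major):

    #include <vector>

    // w: n_out x n_in, A: r x n_in, B: n_out x r  (illustrative shapes, not ggml's)
    static void lora_merge(std::vector<float> & w, const std::vector<float> & A,
                           const std::vector<float> & B, int n_in, int n_out, int r, float s) {
        for (int o = 0; o < n_out; ++o)
            for (int i = 0; i < n_in; ++i) {
                float acc = 0.0f;
                for (int k = 0; k < r; ++k) acc += B[o * r + k] * A[k * n_in + i];
                w[o * n_in + i] += s * acc;   // in-place update, as in the inplace path above
            }
    }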
 
3007
 
3008
  #define LLAMA_MAX_RNG_STATE (64*1024)
3009
 
3010
+ void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
3011
+ if (seed == LLAMA_DEFAULT_SEED) {
3012
  seed = time(NULL);
3013
  }
3014
  ctx->rng.seed(seed);
 
3337
  int n_tokens,
3338
  int n_past,
3339
  int n_threads) {
3340
+ if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
3341
+ fprintf(stderr, "%s: failed to eval\n", __func__);
3342
+ return 1;
3343
+ }
3344
+
3345
+ // get a more accurate load time, upon first eval
3346
+ // TODO: fix this
3347
+ if (!ctx->has_evaluated_once) {
3348
+ ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
3349
+ ctx->has_evaluated_once = true;
3350
+ }
3351
+
3352
+ return 0;
3353
+ }
3354
+
3355
+
3356
+ int llama_eval_embd(
3357
+ struct llama_context * ctx,
3358
+ const float * embd,
3359
+ int n_tokens,
3360
+ int n_past,
3361
+ int n_threads) {
3362
+ if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
3363
  fprintf(stderr, "%s: failed to eval\n", __func__);
3364
  return 1;
3365
  }
 
3380
 
3381
  const std::vector<llama_token> tmp(n_batch, llama_token_bos());
3382
 
3383
+ if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
3384
  fprintf(stderr, "%s: failed to eval\n", __func__);
3385
  return 1;
3386
  }
llama.h CHANGED
@@ -46,6 +46,8 @@
46
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
47
  #define LLAMA_SESSION_VERSION 1
48
 
 
 
49
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
50
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
51
  #define LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -81,11 +83,11 @@ extern "C" {
81
  typedef void (*llama_progress_callback)(float progress, void *ctx);
82
 
83
  struct llama_context_params {
84
- int seed; // RNG seed, -1 for random
85
- int n_ctx; // text context
86
- int n_batch; // prompt processing batch size
87
- int n_gpu_layers; // number of layers to store in VRAM
88
- int main_gpu; // the GPU that is used for scratch and small tensors
89
  float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
90
  // called with a progress value between 0 and 1, pass NULL to disable
91
  llama_progress_callback progress_callback;
@@ -140,8 +142,9 @@ extern "C" {
140
 
141
  // TODO: not great API - very likely to change
142
  // Initialize the llama + ggml backend
 
143
  // Call once at the start of the program
144
- LLAMA_API void llama_init_backend();
145
 
146
  LLAMA_API int64_t llama_time_us();
147
 
@@ -195,7 +198,7 @@ extern "C" {
195
  LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
196
 
197
  // Sets the current rng seed.
198
- LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
199
 
200
  // Returns the maximum size in bytes of the state (rng, logits, embedding
201
  // and kv_cache) - will often be smaller after compacting tokens
@@ -225,6 +228,14 @@ extern "C" {
225
  int n_past,
226
  int n_threads);
227
 
 
 
 
 
 
 
 
 
228
  // Export a static computation graph for context of 511 and batch size of 1
229
  // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
230
  // parameters here to keep things simple
 
46
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
47
  #define LLAMA_SESSION_VERSION 1
48
 
49
+ #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
50
+
51
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
52
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
53
  #define LLAMA_SUPPORTS_GPU_OFFLOAD
 
83
  typedef void (*llama_progress_callback)(float progress, void *ctx);
84
 
85
  struct llama_context_params {
86
+ uint32_t seed; // RNG seed, -1 for random
87
+ int32_t n_ctx; // text context
88
+ int32_t n_batch; // prompt processing batch size
89
+ int32_t n_gpu_layers; // number of layers to store in VRAM
90
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
91
  float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
92
  // called with a progress value between 0 and 1, pass NULL to disable
93
  llama_progress_callback progress_callback;
 
142
 
143
  // TODO: not great API - very likely to change
144
  // Initialize the llama + ggml backend
145
+ // If numa is true, use NUMA optimizations
146
  // Call once at the start of the program
147
+ LLAMA_API void llama_init_backend(bool numa);
148
 
149
  LLAMA_API int64_t llama_time_us();
150
 
 
198
  LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
199
 
200
  // Sets the current rng seed.
201
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
202
 
203
  // Returns the maximum size in bytes of the state (rng, logits, embedding
204
  // and kv_cache) - will often be smaller after compacting tokens
 
228
  int n_past,
229
  int n_threads);
230
 
231
+ // Same as llama_eval, but use float matrix input directly.
232
+ LLAMA_API int llama_eval_embd(
233
+ struct llama_context * ctx,
234
+ const float * embd,
235
+ int n_tokens,
236
+ int n_past,
237
+ int n_threads);
238
+
239
  // Export a static computation graph for context of 511 and batch size of 1
240
  // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
241
  // parameters here to keep things simple
make_old_pyinstaller_cuda.bat CHANGED
@@ -1,4 +1,4 @@
1
  echo This file is only for my own usage, please do not use it. I am lazy.
2
 
3
  set PATH=d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python;d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python\\Scripts;%PATH%
4
- PyInstaller --noconfirm --onefile --clean --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./cublas64_11.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cudart64_110.dll;." --add-data "./msvcp140.dll;." --add-data "./vcruntime140.dll;." --add-data "./vcruntime140_1.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe"
 
1
  echo This file is only for my own usage, please do not use it. I am lazy.
2
 
3
  set PATH=d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python;d:\\MainApplications\\KoboldAIGPT\\KoboldAI-Horde-Bridge\\python\\Scripts;%PATH%
4
+ PyInstaller --noconfirm --onefile --clean --console --icon "./nikogreen.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_failsafe.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." --add-data "./koboldcpp_cublas.dll;." --add-data "./cublas64_11.dll;." --add-data "./cublasLt64_11.dll;." --add-data "./cudart64_110.dll;." --add-data "./msvcp140.dll;." --add-data "./vcruntime140.dll;." --add-data "./vcruntime140_1.dll;." --add-data "./rwkv_vocab.embd;." --add-data "./rwkv_world_vocab.embd;." "./koboldcpp.py" -n "koboldcpp.exe"
msvcp140.dll ADDED
Binary file (580 kB).
 
nikogreen.ico ADDED
otherarch/ggml_v2-cuda-legacy.cu ADDED
@@ -0,0 +1,712 @@
1
+ #include <cstddef>
2
+ #include <cstdint>
3
+ #include <stdint.h>
4
+ #include <stdio.h>
5
+ #include <atomic>
6
+
7
+ #include <cuda_runtime.h>
8
+ #include <cublas_v2.h>
9
+ #include <cuda_fp16.h>
10
+
11
+ #include "ggml_v2-cuda-legacy.h"
12
+ #include "ggml_v2-cuda.h"
13
+ #include "ggml_v2.h"
14
+
15
+ static_assert(sizeof(half) == sizeof(ggml_v2_fp16_t), "wrong fp16 size");
16
+
17
+ #define CUDA_CHECK(err) \
18
+ do { \
19
+ cudaError_t err_ = (err); \
20
+ if (err_ != cudaSuccess) { \
21
+ fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
22
+ cudaGetErrorString(err_)); \
23
+ exit(1); \
24
+ } \
25
+ } while (0)
26
+
27
+ #define CUBLAS_CHECK(err) \
28
+ do { \
29
+ cublasStatus_t err_ = (err); \
30
+ if (err_ != CUBLAS_STATUS_SUCCESS) { \
31
+ fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
32
+ exit(1); \
33
+ } \
34
+ } while (0)
35
+
36
+ typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
37
+
38
+ #define QK4_0 32
39
+ typedef struct {
40
+ float d; // delta
41
+ uint8_t qs[QK4_0 / 2]; // nibbles / quants
42
+ } block_q4_0;
43
+ static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
44
+
45
+ #define QK4_1 32
46
+ typedef struct {
47
+ float d; // delta
48
+ float m; // min
49
+ uint8_t qs[QK4_1 / 2]; // nibbles / quants
50
+ } block_q4_1;
51
+ static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
52
+
53
+ #define QK4_2 16
54
+ typedef struct {
55
+ half d; // delta
56
+ uint8_t qs[QK4_2 / 2]; // nibbles / quants
57
+ } block_q4_2;
58
+ static_assert(sizeof(block_q4_2) == sizeof(ggml_v2_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
59
+
60
+ #define QK4_3 16
61
+ typedef struct {
62
+ __half d; // delta
63
+ __half m; // min
64
+ uint8_t qs[QK4_3 / 2]; // nibbles / quants
65
+ } block_q4_3;
66
+ static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_v2_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
67
+
68
+ #define QK5_0 32
69
+ typedef struct {
70
+ half d; // delta
71
+ uint8_t qh[4]; // 5-th bit of quants
72
+ uint8_t qs[QK5_0 / 2]; // nibbles / quants
73
+ } block_q5_0;
74
+ static_assert(sizeof(block_q5_0) == sizeof(ggml_v2_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
75
+
76
+ #define QK5_1 32
77
+ typedef struct {
78
+ half d; // delta
79
+ half m; // min
80
+ uint8_t qh[4]; // 5-th bit of quants
81
+ uint8_t qs[QK5_1 / 2]; // nibbles / quants
82
+ } block_q5_1;
83
+ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_v2_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
84
+
85
+ #define QK8_0 32
86
+ typedef struct {
87
+ float d; // delta
88
+ int8_t qs[QK8_0]; // quants
89
+ } block_q8_0;
90
+ static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
91
+
92
+ static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
93
+ const block_q4_0 * x = (const block_q4_0 *) vx;
94
+
95
+ const int i = blockIdx.x;
96
+
97
+ const float d = x[i].d;
98
+
99
+ const uint8_t * pp = x[i].qs;
100
+
101
+ for (int l = 0; l < QK4_0; l += 2) {
102
+ const uint8_t vi = pp[l/2];
103
+
104
+ const int8_t vi0 = vi & 0xf;
105
+ const int8_t vi1 = vi >> 4;
106
+
107
+ const float v0 = (vi0 - 8)*d;
108
+ const float v1 = (vi1 - 8)*d;
109
+
110
+ y[i*QK4_0 + l + 0] = v0;
111
+ y[i*QK4_0 + l + 1] = v1;
112
+ }
113
+ }
114
+
115
+ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
116
+ const block_q4_1 * x = (const block_q4_1 *) vx;
117
+
118
+ const int i = blockIdx.x;
119
+
120
+ const float d = x[i].d;
121
+ const float m = x[i].m;
122
+
123
+ const uint8_t * pp = x[i].qs;
124
+
125
+ for (int l = 0; l < QK4_1; l += 2) {
126
+ const uint8_t vi = pp[l/2];
127
+
128
+ const int8_t vi0 = vi & 0xf;
129
+ const int8_t vi1 = vi >> 4;
130
+
131
+ const float v0 = vi0*d + m;
132
+ const float v1 = vi1*d + m;
133
+
134
+ y[i*QK4_1 + l + 0] = v0;
135
+ y[i*QK4_1 + l + 1] = v1;
136
+ }
137
+ }
138
+
139
+ static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
140
+ const block_q4_2 * x = (const block_q4_2 *) vx;
141
+
142
+ const int i = blockIdx.x;
143
+
144
+ const float d = x[i].d;
145
+
146
+ const uint8_t * pp = x[i].qs;
147
+
148
+ for (int l = 0; l < QK4_2; l += 2) {
149
+ const uint8_t vi = pp[l/2];
150
+
151
+ const int8_t vi0 = vi & 0xf;
152
+ const int8_t vi1 = vi >> 4;
153
+
154
+ const float v0 = (vi0 - 8)*d;
155
+ const float v1 = (vi1 - 8)*d;
156
+
157
+ y[i*QK4_2 + l + 0] = v0;
158
+ y[i*QK4_2 + l + 1] = v1;
159
+ }
160
+ }
161
+
162
+ static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
163
+ const block_q4_3 * x = (const block_q4_3 *) vx;
164
+
165
+ const int i = blockIdx.x;
166
+
167
+ const float d = x[i].d;
168
+ const float m = x[i].m;
169
+
170
+ const uint8_t * pp = x[i].qs;
171
+
172
+ for (int l = 0; l < QK4_3; l += 2) {
173
+ const uint8_t vi = pp[l/2];
174
+
175
+ const int8_t vi0 = vi & 0xf;
176
+ const int8_t vi1 = vi >> 4;
177
+
178
+ const float v0 = vi0*d + m;
179
+ const float v1 = vi1*d + m;
180
+
181
+ y[i*QK4_3 + l + 0] = v0;
182
+ y[i*QK4_3 + l + 1] = v1;
183
+ }
184
+ }
185
+
186
+ static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
187
+ const block_q5_0 * x = (const block_q5_0 *) vx;
188
+
189
+ const int i = blockIdx.x;
190
+
191
+ const float d = x[i].d;
192
+
193
+ const uint8_t * pp = x[i].qs;
194
+
195
+ uint32_t qh;
196
+ memcpy(&qh, x[i].qh, sizeof(qh));
197
+
198
+ for (int l = 0; l < QK5_0; l += 2) {
199
+ const uint8_t vi = pp[l/2];
200
+
201
+ const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
202
+ const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
203
+
204
+ const int8_t vi0 = ((vi & 0xf) | vh0);
205
+ const int8_t vi1 = ((vi >> 4) | vh1);
206
+
207
+ const float v0 = (vi0 - 16)*d;
208
+ const float v1 = (vi1 - 16)*d;
209
+
210
+ y[i*QK5_0 + l + 0] = v0;
211
+ y[i*QK5_0 + l + 1] = v1;
212
+ }
213
+ }
214
+
215
+ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
216
+ const block_q5_1 * x = (const block_q5_1 *) vx;
217
+
218
+ const int i = blockIdx.x;
219
+
220
+ const float d = x[i].d;
221
+ const float m = x[i].m;
222
+
223
+ const uint8_t * pp = x[i].qs;
224
+
225
+ uint32_t qh;
226
+ memcpy(&qh, x[i].qh, sizeof(qh));
227
+
228
+ for (int l = 0; l < QK5_1; l += 2) {
229
+ const uint8_t vi = pp[l/2];
230
+
231
+ const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
232
+ const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
233
+
234
+ const int8_t vi0 = (vi & 0xf) | vh0;
235
+ const int8_t vi1 = (vi >> 4) | vh1;
236
+
237
+ const float v0 = vi0*d + m;
238
+ const float v1 = vi1*d + m;
239
+
240
+ y[i*QK5_1 + l + 0] = v0;
241
+ y[i*QK5_1 + l + 1] = v1;
242
+ }
243
+ }
244
+
245
+ static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
246
+ const block_q8_0 * x = (const block_q8_0 *) vx;
247
+
248
+ const int i = blockIdx.x;
249
+
250
+ const float d = x[i].d;
251
+
252
+ const int8_t * pp = x[i].qs;
253
+
254
+ for (int l = 0; l < QK8_0; l++) {
255
+ const int8_t vi = pp[l];
256
+
257
+ y[i*QK8_0 + l] = vi*d;
258
+ }
259
+ }
260
+
261
+ static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
262
+ const int nb = k / QK4_0;
263
+ dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
264
+ }
265
+
266
+ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
267
+ const int nb = k / QK4_1;
268
+ dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
269
+ }
270
+
271
+ static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
272
+ const int nb = k / QK4_2;
273
+ dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
274
+ }
275
+
276
+ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
277
+ const int nb = k / QK4_3;
278
+ dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
279
+ }
280
+
281
+ static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
282
+ const int nb = k / QK5_0;
283
+ dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
284
+ }
285
+
286
+ static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
287
+ const int nb = k / QK5_1;
288
+ dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
289
+ }
290
+
291
+ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
292
+ const int nb = k / QK8_0;
293
+ dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
294
+ }
295
+
296
+ // TODO: optimize
297
+ static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
298
+ const half * x = (const half *) vx;
299
+
300
+ const int i = blockIdx.x;
301
+
302
+ y[i] = __half2float(x[i]);
303
+ }
304
+
305
+ static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStream_t stream) {
306
+ convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
307
+ }
308
+
309
+ static to_fp32_cuda_t ggml_v2_get_to_fp32_cuda(ggml_v2_type type) {
310
+ switch (type) {
311
+ case GGML_V2_TYPE_Q4_0:
312
+ return dequantize_row_q4_0_cuda;
313
+ case GGML_V2_TYPE_Q4_1:
314
+ return dequantize_row_q4_1_cuda;
315
+ case GGML_V2_TYPE_Q4_2:
316
+ return dequantize_row_q4_2_cuda;
317
+ case GGML_V2_TYPE_Q4_3:
318
+ return dequantize_row_q4_3_cuda;
319
+ case GGML_V2_TYPE_Q5_0:
320
+ return dequantize_row_q5_0_cuda;
321
+ case GGML_V2_TYPE_Q5_1:
322
+ return dequantize_row_q5_1_cuda;
323
+ case GGML_V2_TYPE_Q8_0:
324
+ return dequantize_row_q8_0_cuda;
325
+ case GGML_V2_TYPE_F16:
326
+ return convert_fp16_to_fp32_cuda;
327
+ default:
328
+ return nullptr;
329
+ }
330
+ }
331
+
332
+ // buffer pool for cuda
333
+ #define MAX_CUDA_BUFFERS 16
334
+
335
+ struct scoped_spin_lock {
336
+ std::atomic_flag& lock;
337
+ scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
338
+ while (lock.test_and_set(std::memory_order_acquire)) {
339
+ ; // spin
340
+ }
341
+ }
342
+ ~scoped_spin_lock() {
343
+ lock.clear(std::memory_order_release);
344
+ }
345
+ scoped_spin_lock(const scoped_spin_lock&) = delete;
346
+ scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
347
+ };
348
+
349
+ struct cuda_buffer {
350
+ void * ptr = nullptr;
351
+ size_t size = 0;
352
+ };
353
+
354
+ static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
355
+ static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
356
+
357
+ static void * ggml_v2_cuda_pool_malloc(size_t size, size_t * actual_size) {
358
+ scoped_spin_lock lock(g_cuda_pool_lock);
359
+
360
+ for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
361
+ cuda_buffer& b = g_cuda_buffer_pool[i];
362
+ if (b.size >= size && b.ptr != nullptr) {
363
+ void * ptr = b.ptr;
364
+ *actual_size = b.size;
365
+ b.ptr = nullptr;
366
+ b.size = 0;
367
+ return ptr;
368
+ }
369
+ }
370
+ void * ptr;
371
+ CUDA_CHECK(cudaMalloc((void **) &ptr, size));
372
+ *actual_size = size;
373
+ return ptr;
374
+ }
375
+
376
+ static void ggml_v2_cuda_pool_free(void * ptr, size_t size) {
377
+ scoped_spin_lock lock(g_cuda_pool_lock);
378
+
379
+ for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
380
+ cuda_buffer& b = g_cuda_buffer_pool[i];
381
+ if (b.ptr == nullptr) {
382
+ b.ptr = ptr;
383
+ b.size = size;
384
+ return;
385
+ }
386
+ }
387
+ fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
388
+ CUDA_CHECK(cudaFree(ptr));
389
+ }
390
+
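The pool above parks freed device buffers for reuse instead of returning them to the driver, so a second request of the same size is served without another cudaMalloc. Illustrative round-trip (uses only the two functions defined above):

    static void pool_round_trip() {
        size_t actual = 0;
        void * a = ggml_v2_cuda_pool_malloc(1 << 20, &actual);   // first call: fresh cudaMalloc
        ggml_v2_cuda_pool_free(a, actual);                       // parked in g_cuda_buffer_pool
        void * b = ggml_v2_cuda_pool_malloc(1 << 20, &actual);   // reuses the parked buffer
        ggml_v2_cuda_pool_free(b, actual);
    }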
391
+ #define GGML_V2_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
392
+ #define GGML_V2_CUDA_MAX_EVENTS 64
393
+ static cublasHandle_t g_cublasH = nullptr;
394
+ static cudaStream_t g_cudaStreams[GGML_V2_CUDA_MAX_STREAMS] = { nullptr };
395
+ static cudaStream_t g_cudaStreams2[GGML_V2_CUDA_MAX_STREAMS] = { nullptr };
396
+ static cudaEvent_t g_cudaEvents[GGML_V2_CUDA_MAX_EVENTS] = { nullptr };
397
+
398
+ void ggml_v2_init_cublas_legacy() {
399
+ if (g_cublasH == nullptr) {
400
+ // create streams
401
+ for (int i = 0; i < GGML_V2_CUDA_MAX_STREAMS; ++i) {
402
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[i], cudaStreamNonBlocking));
403
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams2[i], cudaStreamNonBlocking));
404
+ }
405
+ // create events
406
+ for (int i = 0; i < GGML_V2_CUDA_MAX_EVENTS; ++i) {
407
+ CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents[i], cudaEventDisableTiming));
408
+ }
409
+
410
+ // create cublas handle
411
+ CUBLAS_CHECK(cublasCreate(&g_cublasH));
412
+ CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH));
413
+
414
+ // configure logging to stdout
415
+ // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
416
+ }
417
+ }
418
+
419
+
420
+
421
+ static cudaError_t ggml_v2_cuda_h2d_tensor_2d(void * dst, const struct ggml_v2_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
422
+ const uint64_t ne0 = src->ne[0];
423
+ const uint64_t ne1 = src->ne[1];
424
+ const uint64_t nb0 = src->nb[0];
425
+ const uint64_t nb1 = src->nb[1];
426
+ const uint64_t nb2 = src->nb[2];
427
+ const uint64_t nb3 = src->nb[3];
428
+ const enum ggml_v2_type type = src->type;
429
+ const size_t ts = ggml_v2_type_size(type);
430
+ const size_t bs = ggml_v2_blck_size(type);
431
+
432
+ const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
433
+ if (nb0 == ts && nb1 == ts*ne0/bs) {
434
+ return cudaMemcpyAsync(dst, x, ne1*nb1, cudaMemcpyHostToDevice, stream);
435
+ } else if (nb0 == ts) {
436
+ return cudaMemcpy2DAsync(dst, ts*ne0/bs, x, nb1, ts*ne0/bs, ne1, cudaMemcpyHostToDevice, stream);
437
+ } else {
438
+ for (uint64_t i1 = 0; i1 < ne1; i1++) {
439
+ const void * rx = (const void *) ((const char *) x + i1*nb1);
440
+ void * rd = (void *) ((char *) dst + i1*ts*ne0/bs);
441
+ // pretend the row is a matrix with cols=1
442
+ cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
443
+ if (r != cudaSuccess) return r;
444
+ }
445
+ return cudaSuccess;
446
+ }
447
+ }
448
+
449
+ static void ggml_v2_cuda_mul_mat_f32(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst) {
450
+ const int64_t ne00 = src0->ne[0];
451
+ const int64_t ne01 = src0->ne[1];
452
+ const int64_t ne02 = src0->ne[2];
453
+ const int64_t ne03 = src0->ne[3];
454
+
455
+ const int64_t ne10 = src1->ne[0];
456
+ const int64_t ne11 = src1->ne[1];
457
+
458
+ const int nb2 = dst->nb[2];
459
+ const int nb3 = dst->nb[3];
460
+
461
+ const float alpha = 1.0f;
462
+ const float beta = 0.0f;
463
+ const int x_ne = ne01 * ne00;
464
+ const int y_ne = ne11 * ne10;
465
+ const int d_ne = ne11 * ne01;
466
+ const int n_mm = ne03 * ne02;
467
+
468
+ size_t x_size, y_size, d_size;
469
+ float * d_X = (float *) ggml_v2_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
470
+ float * d_Y = (float *) ggml_v2_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
471
+ float * d_D = (float *) ggml_v2_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
472
+
473
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
474
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
475
+ int i = i03*ne02 + i02;
476
+ cudaStream_t cudaStream = g_cudaStreams[i % GGML_V2_CUDA_MAX_STREAMS];
477
+
478
+ float * c_X = d_X + i * x_ne;
479
+ float * c_Y = d_Y + i * y_ne;
480
+ float * c_D = d_D + i * d_ne;
481
+
482
+ // copy data to device
483
+ CUDA_CHECK(ggml_v2_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
484
+ CUDA_CHECK(ggml_v2_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
485
+
486
+ // compute
487
+ CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
488
+ CUBLAS_CHECK(
489
+ cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
490
+ ne01, ne11, ne10,
491
+ &alpha, c_X, ne00,
492
+ c_Y, ne10,
493
+ &beta, c_D, ne01));
494
+
495
+ // copy dst to host
496
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
497
+ CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
498
+ }
499
+ }
500
+
501
+ CUDA_CHECK(cudaDeviceSynchronize());
502
+ ggml_v2_cuda_pool_free(d_X, x_size);
503
+ ggml_v2_cuda_pool_free(d_Y, y_size);
504
+ ggml_v2_cuda_pool_free(d_D, d_size);
505
+ }
506
+
507
+ static void ggml_v2_cuda_mul_mat_f16(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst, void * wdata, size_t /* wsize */) {
508
+ const int64_t ne00 = src0->ne[0];
509
+ const int64_t ne01 = src0->ne[1];
510
+ const int64_t ne02 = src0->ne[2];
511
+ const int64_t ne03 = src0->ne[3];
512
+
513
+ const int64_t ne10 = src1->ne[0];
514
+ const int64_t ne11 = src1->ne[1];
515
+
516
+ const int nb10 = src1->nb[0];
517
+ const int nb11 = src1->nb[1];
518
+ const int nb12 = src1->nb[2];
519
+ const int nb13 = src1->nb[3];
520
+
521
+ const int nb2 = dst->nb[2];
522
+ const int nb3 = dst->nb[3];
523
+
524
+ const float alpha = 1.0f;
525
+ const float beta = 0.0f;
526
+ const int x_ne = ne01 * ne00;
527
+ const int y_ne = ne11 * ne10;
528
+ const int d_ne = ne11 * ne01;
529
+ const int n_mm = ne03 * ne02;
530
+
531
+ size_t x_size, y_size, d_size;
532
+ half * d_X = (half *) ggml_v2_cuda_pool_malloc(n_mm * sizeof(half) * x_ne, &x_size);
533
+ half * d_Y = (half *) ggml_v2_cuda_pool_malloc(n_mm * sizeof(half) * y_ne, &y_size);
534
+ float * d_D = (float *) ggml_v2_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
535
+
536
+ bool src1_cont_rows = nb10 == sizeof(float);
537
+ bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
538
+
539
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
540
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
541
+ int i = i03*ne02 + i02;
542
+ cudaStream_t cudaStream = g_cudaStreams[i % GGML_V2_CUDA_MAX_STREAMS];
543
+
544
+ half * c_X = d_X + i * x_ne;
545
+ half * c_Y = d_Y + i * y_ne;
546
+ float * c_D = d_D + i * d_ne;
547
+
548
+ // copy src0 to device
549
+ CUDA_CHECK(ggml_v2_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
550
+
551
+ // convert src1 to fp16
552
+ // TODO: use multiple threads
553
+ ggml_v2_fp16_t * const tmp = (ggml_v2_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
554
+ char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
555
+ if (src1_cont_rows) {
556
+ if (src1_cont_cols) {
557
+ ggml_v2_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
558
+ }
559
+ else {
560
+ for (int64_t i01 = 0; i01 < ne11; i01++) {
561
+ ggml_v2_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
562
+ }
563
+ }
564
+ }
565
+ else {
566
+ for (int64_t i01 = 0; i01 < ne11; i01++) {
567
+ for (int64_t i00 = 0; i00 < ne10; i00++) {
568
+ // very slow due to no inlining
569
+ tmp[i01*ne10 + i00] = ggml_v2_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
570
+ }
571
+ }
572
+ }
573
+
574
+ // copy src1 to device
575
+ CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream));
576
+
577
+ // compute
578
+ CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
579
+ CUBLAS_CHECK(
580
+ cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
581
+ ne01, ne11, ne10,
582
+ &alpha, c_X, CUDA_R_16F, ne00,
583
+ c_Y, CUDA_R_16F, ne10,
584
+ &beta, c_D, CUDA_R_32F, ne01,
585
+ CUBLAS_COMPUTE_32F_FAST_16F,
586
+ CUBLAS_GEMM_DEFAULT));
587
+
588
+ // copy dst to host
589
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
590
+ CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
591
+ }
592
+ }
593
+
594
+ CUDA_CHECK(cudaDeviceSynchronize());
595
+ ggml_v2_cuda_pool_free(d_X, x_size);
596
+ ggml_v2_cuda_pool_free(d_Y, y_size);
597
+ ggml_v2_cuda_pool_free(d_D, d_size);
598
+ }
599
+
600
+ static void ggml_v2_cuda_mul_mat_q_f32(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst) {
601
+ const int64_t ne00 = src0->ne[0];
602
+ const int64_t ne01 = src0->ne[1];
603
+ const int64_t ne02 = src0->ne[2];
604
+ const int64_t ne03 = src0->ne[3];
605
+
606
+ const int64_t ne10 = src1->ne[0];
607
+ const int64_t ne11 = src1->ne[1];
608
+
609
+ const int nb2 = dst->nb[2];
610
+ const int nb3 = dst->nb[3];
611
+ const ggml_v2_type type = src0->type;
612
+
613
+ const float alpha = 1.0f;
614
+ const float beta = 0.0f;
615
+ const int x_ne = ne01 * ne00;
616
+ const int y_ne = ne11 * ne10;
617
+ const int d_ne = ne11 * ne01;
618
+ const int n_mm = ne03 * ne02;
619
+ const size_t q_sz = ggml_v2_type_size(type) * x_ne / ggml_v2_blck_size(type);
620
+
621
+ size_t x_size, y_size, d_size, q_size;
622
+ float * d_X = (float *) ggml_v2_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
623
+ float * d_Y = (float *) ggml_v2_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
624
+ float * d_D = (float *) ggml_v2_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
625
+ char * d_Q = (char *) ggml_v2_cuda_pool_malloc(n_mm * q_sz, &q_size);
626
+
627
+ const to_fp32_cuda_t to_fp32_cuda = ggml_v2_get_to_fp32_cuda(type);
628
+ GGML_V2_ASSERT(to_fp32_cuda != nullptr);
629
+
630
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
631
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
632
+ int i = i03*ne02 + i02;
633
+ cudaStream_t cudaStream = g_cudaStreams[i % GGML_V2_CUDA_MAX_STREAMS];
634
+ cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_V2_CUDA_MAX_STREAMS];
635
+ cudaEvent_t cudaEvent = g_cudaEvents[i % GGML_V2_CUDA_MAX_EVENTS];
636
+
637
+ float * c_X = d_X + i * x_ne;
638
+ float * c_Y = d_Y + i * y_ne;
639
+ float * c_D = d_D + i * d_ne;
640
+ char * c_Q = d_Q + i * q_sz;
641
+
642
+ // copy src0 and convert to fp32 on device
643
+ CUDA_CHECK(ggml_v2_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
644
+ to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
645
+ CUDA_CHECK(cudaGetLastError());
646
+ CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
647
+
648
+ // copy src1 to device
649
+ CUDA_CHECK(ggml_v2_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
650
+
651
+ // wait for conversion
652
+ CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
653
+
654
+ // compute
655
+ CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
656
+ CUBLAS_CHECK(
657
+ cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
658
+ ne01, ne11, ne10,
659
+ &alpha, c_X, ne00,
660
+ c_Y, ne10,
661
+ &beta, c_D, ne01));
662
+
663
+ // copy dst to host
664
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
665
+ CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
666
+ }
667
+ }
668
+
669
+ CUDA_CHECK(cudaDeviceSynchronize());
670
+ ggml_v2_cuda_pool_free(d_X, x_size);
671
+ ggml_v2_cuda_pool_free(d_Y, y_size);
672
+ ggml_v2_cuda_pool_free(d_D, d_size);
673
+ ggml_v2_cuda_pool_free(d_Q, q_size);
674
+ }
675
+
676
+ static bool ggml_v2_cuda_mul_mat_use_f16(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * /* dst */) {
677
+ size_t src0_sz = ggml_v2_nbytes(src0);
678
+ size_t src1_sz = ggml_v2_nbytes(src1);
679
+
680
+ // mul_mat_q: src0 is converted to fp32 on device
681
+ size_t mul_mat_q_transfer = src0_sz + src1_sz;
682
+
683
+ // mul_mat_f16: src1 is converted to fp16 on cpu
684
+ size_t mul_mat_f16_transfer = src0_sz + sizeof(half) * ggml_v2_nelements(src1);
685
+
686
+ // choose the smaller one to transfer to the device
687
+ // TODO: this is not always the best choice due to the overhead of converting to fp16
688
+ return mul_mat_f16_transfer < mul_mat_q_transfer;
689
+ }
690
+
691
+ void ggml_v2_cuda_mul_mat_legacy(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst, void * wdata, size_t wsize) {
692
+ GGML_V2_ASSERT(ggml_v2_cuda_can_mul_mat(src0, src1, dst));
693
+
694
+ if (src0->type == GGML_V2_TYPE_F32) {
695
+ ggml_v2_cuda_mul_mat_f32(src0, src1, dst);
696
+ }
697
+ else if (src0->type == GGML_V2_TYPE_F16) {
698
+ if (ggml_v2_cuda_mul_mat_use_f16(src0, src1, dst)) {
699
+ ggml_v2_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize);
700
+ }
701
+ else {
702
+ ggml_v2_cuda_mul_mat_q_f32(src0, src1, dst);
703
+ }
704
+ }
705
+ else if (ggml_v2_is_quantized(src0->type)) {
706
+ ggml_v2_cuda_mul_mat_q_f32(src0, src1, dst);
707
+ }
708
+ else {
709
+ GGML_V2_ASSERT(false);
710
+ }
711
+ }
712
+
otherarch/ggml_v2-cuda-legacy.h ADDED
@@ -0,0 +1,14 @@
1
+ #include "ggml_v2.h"
2
+
3
+ #ifdef __cplusplus
4
+ extern "C" {
5
+ #endif
6
+
7
+ void ggml_v2_init_cublas_legacy(void);
8
+
9
+ void ggml_v2_cuda_mul_mat_legacy(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst, void * wdata, size_t wsize);
10
+
11
+
12
+ #ifdef __cplusplus
13
+ }
14
+ #endif